diff --git a/4-GPU/HandsOn/C/.clang-format b/4-GPU/HandsOn/C/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..c38075d9b581125672afffab80d5b7bed31baa7a
--- /dev/null
+++ b/4-GPU/HandsOn/C/.clang-format
@@ -0,0 +1,148 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:   
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: true
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:   
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Preserve
+IncludeCategories: 
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats: 
+  - Language:        Cpp
+    Delimiters:      
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:      
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions: 
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+ReflowComments:  true
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Auto
+TabWidth:        8
+UseTab:          Never
+...
+
diff --git a/4-GPU/HandsOn/C/task0/Makefile b/4-GPU/HandsOn/C/task0/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e3265d91d53c6ff782546d7ba7011e06ba661617
--- /dev/null
+++ b/4-GPU/HandsOn/C/task0/Makefile
@@ -0,0 +1,49 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = pgcc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	CFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=0
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.pgprof poisson2d.pgprof
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF}  -f -o ${SC19_DIR_SCRATCH}/poisson2d.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.pgprof .
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof .
\ No newline at end of file
diff --git a/4-GPU/HandsOn/C/task0/common.h b/4-GPU/HandsOn/C/task0/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d1efa0c24854c242049dba3633d0e4001a09fd
--- /dev/null
+++ b/4-GPU/HandsOn/C/task0/common.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+double get_time();
+
+void poisson2d_serial( int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task0/poisson2d.c b/4-GPU/HandsOn/C/task0/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..e0eed22fa624796c68fc23f3285db5b14ab0ab86
--- /dev/null
+++ b/4-GPU/HandsOn/C/task0/poisson2d.c
@@ -0,0 +1,137 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 2048;
+    int nx = 2048;
+    int iter_max = 500;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    printf("Calculate reference solution and time serial CPU execution.\n");
+    double start = get_time();
+    poisson2d_serial(iter_max, tol, Aref, Anew, nx, ny, rhs);
+    double runtime_cpu = get_time() - start;
+
+    printf("GPU execution.\n");
+    start = get_time();
+    int iter = 0;
+    real error = 1.0;
+
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+        // TODO: Parallelize loop nest with OpenACC
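+        // One possible sketch (hedged; poisson2d.solution.c holds the reference
+        // implementation): mark the nest as an OpenACC compute region with a max
+        // reduction on error, e.g.
+        //   #pragma acc parallel loop reduction(max:error)
+        //   for (int iy = iy_start; iy < iy_end; iy++) { ... }
+        // The copy and boundary-condition loop nests below can each be preceded
+        // by a plain "#pragma acc parallel loop".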
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        // TODO: Parallelize loop nest with OpenACC
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        // TODO: Parallelize loop nest with OpenACC
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
+        }
+        // TODO: Parallelize loop nest with OpenACC
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+    double runtime = get_time() - start;
+
+    int errors = 0;
+    if (check_results(ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        printf("%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny, nx, runtime_cpu,
+               runtime, runtime_cpu / runtime);
+    } else {
+        errors = -1;
+    }
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task0/poisson2d_serial.c b/4-GPU/HandsOn/C/task0/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..397097da0150e29eb9efa5b598a1fea57009435b
--- /dev/null
+++ b/4-GPU/HandsOn/C/task0/poisson2d_serial.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "common.h"
+
+double get_time() {
+    struct timeval tv;
+    struct timezone tz;
+    gettimeofday(&tv, &tz);
+    return 1.0 * tv.tv_sec + 1.0E-6 * tv.tv_usec;
+}
+
+void poisson2d_serial(int iter_max, real tol, real* restrict const Aref, real* restrict const Anew,
+                      int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 *
+                    (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                          Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+            }
+        }
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        for (int ix = 1; ix < nx - 1; ix++) {
+            Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+            Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+        }
+        for (int iy = 1; iy < ny - 1; iy++) {
+            Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+            Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+}
+
+int check_results(int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "ERROR: A[%d][%d] = %f does not match %f (reference)\n", iy, ix,
+                        A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/C/task1/Makefile b/4-GPU/HandsOn/C/task1/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ba7aed9c3758168db58f7fe9213c5fa847acc4c2
--- /dev/null
+++ b/4-GPU/HandsOn/C/task1/Makefile
@@ -0,0 +1,58 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = pgcc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	CFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=1
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.*.pgprof poisson2d.*.pgprof *.tar.gz *.csv
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.timeline.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF} --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.metrics.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof ./poisson2d 3
+	pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.efficiency.csv
+	mv ${SC19_DIR_SCRATCH}/poisson2d.*.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.timeline.pgprof poisson2d.metrics.pgprof
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+		${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.timeline.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.solution.metrics.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3
+		pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.solution.efficiency.csv
+		mv ${SC19_DIR_SCRATCH}/poisson2d.solution.*.pgprof .
+		tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz  poisson2d.solution.*.pgprof
+
diff --git a/4-GPU/HandsOn/C/task1/common.h b/4-GPU/HandsOn/C/task1/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d1efa0c24854c242049dba3633d0e4001a09fd
--- /dev/null
+++ b/4-GPU/HandsOn/C/task1/common.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+double get_time();
+
+void poisson2d_serial( int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task1/poisson2d.c b/4-GPU/HandsOn/C/task1/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..055b318854c0b29a2bb89c00f19e795dda30dae1
--- /dev/null
+++ b/4-GPU/HandsOn/C/task1/poisson2d.c
@@ -0,0 +1,139 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 2048;
+    int nx = 2048;
+    int iter_max = 500;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    printf("Calculate reference solution and time serial CPU execution.\n");
+    double start = get_time();
+    poisson2d_serial(iter_max, tol, Aref, Anew, nx, ny, rhs);
+    double runtime_cpu = get_time() - start;
+
+    printf("GPU execution.\n");
+    start = get_time();
+    int iter = 0;
+    real error = 1.0;
+
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop
+        for (int ix = ix_start; ix < ix_end; ix++) {
+#pragma acc loop
+            for (int iy = iy_start; iy < iy_end; iy++) {
+                // TODO: Fix memory access pattern
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
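+        // Sketch of the intended fix (hedged; poisson2d.solution.c holds the
+        // reference implementation): with the row-major layout A[iy * nx + ix],
+        // ix should be the innermost loop so that consecutive threads touch
+        // consecutive memory, e.g.
+        //   #pragma acc parallel loop reduction(max:error)
+        //   for (int iy = iy_start; iy < iy_end; iy++)
+        //       for (int ix = ix_start; ix < ix_end; ix++) { ... }
+        // The gld_efficiency/gst_efficiency metrics collected by "make profile"
+        // show the effect of this change.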
+
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
+        }
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+    double runtime = get_time() - start;
+
+    int errors = 0;
+    if (check_results(ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        printf("%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny, nx, runtime_cpu,
+               runtime, runtime_cpu / runtime);
+    } else {
+        errors = -1;
+    }
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task1/poisson2d_serial.c b/4-GPU/HandsOn/C/task1/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..397097da0150e29eb9efa5b598a1fea57009435b
--- /dev/null
+++ b/4-GPU/HandsOn/C/task1/poisson2d_serial.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "common.h"
+
+double get_time() {
+    struct timeval tv;
+    struct timezone tz;
+    gettimeofday(&tv, &tz);
+    return 1.0 * tv.tv_sec + 1.0E-6 * tv.tv_usec;
+}
+
+void poisson2d_serial(int iter_max, real tol, real* restrict const Aref, real* restrict const Anew,
+                      int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 *
+                    (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                          Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+            }
+        }
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        for (int ix = 1; ix < nx - 1; ix++) {
+            Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+            Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+        }
+        for (int iy = 1; iy < ny - 1; iy++) {
+            Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+            Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+}
+
+int check_results(int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "ERROR: A[%d][%d] = %f does not match %f (reference)\n", iy, ix,
+                        A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/C/task2/Makefile b/4-GPU/HandsOn/C/task2/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..22eb6567c31e7f2863a4852c84a93522e1c4ac5a
--- /dev/null
+++ b/4-GPU/HandsOn/C/task2/Makefile
@@ -0,0 +1,50 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=2
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
\ No newline at end of file
diff --git a/4-GPU/HandsOn/C/task2/common.h b/4-GPU/HandsOn/C/task2/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/C/task2/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task2/poisson2d.c b/4-GPU/HandsOn/C/task2/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..c3a06bdbad6c34c3db78bf1ffe64d2ded4884fe2
--- /dev/null
+++ b/4-GPU/HandsOn/C/task2/poisson2d.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // TODO: handle device affinity
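+    // A minimal sketch (hedged; assumes the OpenACC runtime API from <openacc.h>
+    // and that ranks are spread evenly across the node's GPUs):
+    //   int num_devices = acc_get_num_devices(acc_device_nvidia);
+    //   acc_set_device_num(rank % num_devices, acc_device_nvidia);
+    // Using a node-local rank (e.g. obtained via MPI_Comm_split_type with
+    // MPI_COMM_TYPE_SHARED) instead of the global rank is more robust for
+    // multi-node runs.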
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // TODO: set first and last row to be processed by this rank.
+    int iy_start = 1;
+    int iy_end = (ny - 1);
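+    // Sketch of a row-wise decomposition (hedged; the exact chunking used by
+    // poisson2d.solution.c may differ): give each rank a contiguous block of the
+    // interior rows 1 .. ny-2, e.g.
+    //   int chunk = (ny - 2) / size;
+    //   iy_start  = 1 + rank * chunk;
+    //   iy_end    = (rank == size - 1) ? (ny - 1) : (iy_start + chunk);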
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            // 1. Send row iy_start (first modified row) to top, receive lower boundary
+            // (iy_end) from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup, so do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+// Periodic boundary conditions
+// TODO: Handle top/bottom periodic boundary conditions and halo exchange with MPI
+#pragma acc parallel loop present(A)
+        for (int ix = 1; ix < nx - 1; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
+        }
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        // TODO: Pass device ptr of A to MPI using host_data use_device
+        {
+            double start_mpi = MPI_Wtime();
+            // TODO: 1. Send row iy_start (first modified row) to top, receive lower boundary
+            // (iy_end) from bottom
+
+            // MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_REAL_TYPE,
+            // int dest, 0, void *recvbuf, int recvcount, MPI_REAL_TYPE, int source, 0,
+            // MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+
+            // TODO: 2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary
+            // (iy_start-1) from top
+
+            // MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_REAL_TYPE,
+            // int dest, 0, void *recvbuf, int recvcount, MPI_REAL_TYPE, int source, 0,
+            // MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
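+        // Sketch of the completed exchange (hedged; it mirrors the MPI warm-up
+        // earlier in main):
+        //   #pragma acc host_data use_device(A)
+        //   {
+        //       MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+        //                    A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+        //                    MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        //       MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+        //                    A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+        //                    MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        //   }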
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task2/poisson2d_serial.c b/4-GPU/HandsOn/C/task2/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0229e7530bd3fc021baafe0a744506640b34e67
--- /dev/null
+++ b/4-GPU/HandsOn/C/task2/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/C/task3/Makefile b/4-GPU/HandsOn/C/task3/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c0e62d7691a752cb192f680238e105ad17c39c19
--- /dev/null
+++ b/4-GPU/HandsOn/C/task3/Makefile
@@ -0,0 +1,50 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=3
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
\ No newline at end of file
diff --git a/4-GPU/HandsOn/C/task3/common.h b/4-GPU/HandsOn/C/task3/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/C/task3/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
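+// Maps the MPI size to a (y, x) process grid, e.g. 4 ranks -> 2x2, 6 ranks -> 3x2,
+// 16 ranks -> 4x4; index 0 is unused.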
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task3/poisson2d.c b/4-GPU/HandsOn/C/task3/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d62b6a3ca6c5124828687b9d6bf8718bf2c1550
--- /dev/null
+++ b/4-GPU/HandsOn/C/task3/poisson2d.c
@@ -0,0 +1,224 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+#pragma acc set device_num(rank)
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            // 1. Send row iy_start (the first modified row) to top; receive the lower boundary
+            // row (iy_end) from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (the last modified row) to bottom; receive the upper
+            // boundary row (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+// TODO: Split into halo and bulk part
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+        // TODO: Start bulk part asynchronously
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+            // 1. Send row iy_start (the first modified row) to top; receive the lower boundary
+            // row (iy_end) from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (the last modified row) to bottom; receive the upper
+            // boundary row (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+        // TODO: wait for bulk part
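+        // Sketch of the intended overlap (not the reference solution; async
+        // queue 1 is an arbitrary choice):
+        //   1. copy rows iy_start and iy_end-1 (the halo rows) in their own small loops,
+        //   2. copy the remaining bulk rows with '#pragma acc parallel loop ... async(1)',
+        //   3. do the halo exchange above while the bulk copy is running,
+        //   4. '#pragma acc wait(1)' here before A is used again.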
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task3/poisson2d_serial.c b/4-GPU/HandsOn/C/task3/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0229e7530bd3fc021baafe0a744506640b34e67
--- /dev/null
+++ b/4-GPU/HandsOn/C/task3/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/C/task4/Makefile b/4-GPU/HandsOn/C/task4/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5fac788bd850665aaee74b207efc944bc47ae7dc
--- /dev/null
+++ b/4-GPU/HandsOn/C/task4/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicxx
+CFLAGS = -DUSE_DOUBLE 
+
+#NVSHMEM_HOME=${HOME}/nvshmem-master/build
+NVSHMEM_LIBS= -L${NVSHMEM_HOME}/lib -lnvshmem -Mcuda -lcuda -lrt 
+NVSHMEM_INC = -I${NVSHMEM_HOME}/include
+
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=4
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.c poisson2d_serial.o -o poisson2d ${NVSHMEM_LIBS}
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution ${NVSHMEM_LIBS}
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
diff --git a/4-GPU/HandsOn/C/task4/common.h b/4-GPU/HandsOn/C/task4/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..381bf32b1432f32332420c8be8ac8d364d8c02bf
--- /dev/null
+++ b/4-GPU/HandsOn/C/task4/common.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task4/poisson2d.c b/4-GPU/HandsOn/C/task4/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..2d11e0fa26c7af5aac4f1fbd33b3d7465f8d9b80
--- /dev/null
+++ b/4-GPU/HandsOn/C/task4/poisson2d.c
@@ -0,0 +1,245 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// TODO: Include the necessary headers for NVSHMEM
+
+// Helper function to map an existing device allocation (e.g. from nvshmem_malloc) to a host
+// allocation so that OpenACC uses it
+void map(real* restrict harr, real* restrict darr, int size) { acc_map_data(harr, darr, size); }
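+// Typical use once d_A exists (as in the task5 version of this file):
+//   map(A, d_A, nx * ny * sizeof(real));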
+
+int main(int argc, char** argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // TODO: Initialize NVSHMEM with MPI using nvshmemx_init_attr
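+    // A minimal sketch (this is how the task5 code does it):
+    //   MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    //   nvshmemx_init_attr_t attr;
+    //   attr.mpi_comm = &mpi_comm;
+    //   nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);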
+
+#pragma acc set device_num(rank)
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // TODO: Allocate symmetric device memory for A
+    // real *d_A = ...
+
+    // TODO: For OpenACC we need to map it to A and Anew so that OpenACC knows we already
+    // allocated device memory for A and Anew.
+    // You can use the helper function map(...) above or use acc_map_data directly.
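+    // Sketch, following the task5 version of this file:
+    //   real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    //   map(A, d_A, nx * ny * sizeof(real));
+    // (and nvshmem_free(d_A) at the end of main)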
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // TODO: Warming up MPI / CUDA IPC is not needed with NVSHMEM - remove that part
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            // 1. Send row iy_start (the first modified row) to top; receive the lower boundary
+            // row (iy_end) from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (the last modified row) to bottom; receive the upper
+            // boundary row (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+
+            // TODO: Replace both MPI calls with one-sided nvshmem_<type>_put calls.
+            // Make sure to put the data at the right location on the remote side.
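+            // Sketch for the double-precision case (mirrors the task5 code;
+            // iy_end_top and iy_start_bottom are the neighbours' local row
+            // bounds and still need to be computed, see the task5 version):
+            //   nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),
+            //                      (double *)(A + iy_start * nx + ix_start),
+            //                      (ix_end - ix_start), top);
+            //   nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+            //                      (double *)(A + (iy_end - 1) * nx + ix_start),
+            //                      (ix_end - ix_start), bottom);
+            //   nvshmem_barrier_all();   // make sure the remote data has arrived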
+
+            // 1. Send row iy_start (the first modified row) to top; receive the lower boundary
+            // row (iy_end) from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (the last modified row) to bottom; receive the upper
+            // boundary row (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // TODO: Add a barrier to make sure the data has arrived from the remote PEs
+
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    // TODO: free shmem memory
+
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task4/poisson2d_serial.c b/4-GPU/HandsOn/C/task4/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a1f2ca159e1af0361f194c39e7231a1e0aa19
--- /dev/null
+++ b/4-GPU/HandsOn/C/task4/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/C/task5/Makefile b/4-GPU/HandsOn/C/task5/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9456192f9c4b4cd765a1553ae4642f656e200192
--- /dev/null
+++ b/4-GPU/HandsOn/C/task5/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicxx
+CFLAGS = -DUSE_DOUBLE 
+
+NVSHMEM_HOME=${HOME}/nvshmem-master/build
+NVSHMEM_LIBS= -L${NVSHMEM_HOME}/lib -lnvshmem -Mcuda -lcuda -lrt 
+NVSHMEM_INC = -I${NVSHMEM_HOME}/include
+
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=5
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.c poisson2d_serial.o -o poisson2d ${NVSHMEM_LIBS}
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution ${NVSHMEM_LIBS}
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
diff --git a/4-GPU/HandsOn/C/task5/common.h b/4-GPU/HandsOn/C/task5/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/C/task5/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task5/poisson2d.c b/4-GPU/HandsOn/C/task5/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..3d91e14823e6f137a4c8cc84578c359b226625b7
--- /dev/null
+++ b/4-GPU/HandsOn/C/task5/poisson2d.c
@@ -0,0 +1,238 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map an existing device allocation (e.g. from nvshmem_malloc) to a host
+// allocation so that OpenACC uses it
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
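+    // After acc_map_data, OpenACC treats d_A as the device copy of A, so the
+    // present(A) and host_data use_device(A) clauses below resolve to the
+    // NVSHMEM symmetric buffer.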
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    
+    // TODO: Measuring the MPI time with asynchronous compute is not well defined. Remove it here and below.
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+// TODO: Check which parts of the while loop can be executed asynchronously by adding the async clause.
+// You might also need to use wait.
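+// For example (a sketch; async queue 1 is an arbitrary choice):
+//   #pragma acc parallel loop present(A, Anew, rhs) async(1)
+//   ...
+//   #pragma acc wait(1)   // e.g. before the MPI_Allreduce that reads 'error'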
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        int iy_start_top = top * chunk_size;
+        int iy_end_top = iy_start_top + chunk_size;
+
+        // Do not process boundaries
+        iy_start_top = max(iy_start_top, 1);
+        iy_end_top = min(iy_end_top, ny - 1);
+
+        int iy_start_bottom = bottom * chunk_size;
+        int iy_end_bottom = iy_start_bottom + chunk_size;
+
+        // Do not process boundaries
+        iy_start_bottom = max(iy_start_bottom, 1);
+        iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+        // Halo exchange
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+            // TODO: Get the CUDA stream that corresponds to the OpenACC default async queue
+            // using the acc_get_cuda_stream and acc_get_default_async helper functions.
+            // Replace the nvshmem_double_put calls with their nvshmemx_double_put_on_stream
+            // versions, and do the same for the barrier.
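+            // Sketch (assumes the kernels above use the OpenACC default async queue):
+            //   cudaStream_t stream =
+            //       (cudaStream_t)acc_get_cuda_stream(acc_get_default_async());
+            //   nvshmemx_double_put_on_stream((double *)(A + iy_end_top * nx + ix_start),
+            //                                 (double *)(A + iy_start * nx + ix_start),
+            //                                 (ix_end - ix_start), top, stream);
+            //   ... same for the bottom neighbour ...
+            //   nvshmemx_barrier_all_on_stream(stream);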
+
+            nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),
+                               (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top);
+            nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+                               (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start),
+                               bottom);
+            nvshmem_barrier_all();
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task5/poisson2d_serial.c b/4-GPU/HandsOn/C/task5/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a1f2ca159e1af0361f194c39e7231a1e0aa19
--- /dev/null
+++ b/4-GPU/HandsOn/C/task5/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/C/task6/Makefile b/4-GPU/HandsOn/C/task6/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9ef982901088aa0f7c3b2a51eda234d4904575c6
--- /dev/null
+++ b/4-GPU/HandsOn/C/task6/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicxx
+CFLAGS = -DUSE_DOUBLE 
+
+NVSHMEM_HOME=${HOME}/nvshmem-master/build
+NVSHMEM_LIBS= -L${NVSHMEM_HOME}/lib -lnvshmem -Mcuda -lcuda -lrt 
+NVSHMEM_INC = -I${NVSHMEM_HOME}/include
+
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=6
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.c poisson2d_serial.o -o poisson2d ${NVSHMEM_LIBS}
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution ${NVSHMEM_LIBS}
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
diff --git a/4-GPU/HandsOn/C/task6/common.h b/4-GPU/HandsOn/C/task6/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/C/task6/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
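+
+// Usage sketch (illustration only):
+//   dim2 d = size_to_2Dsize(6);  // with the table above, d.y == 3 and d.x == 2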
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/C/task6/poisson2d.c b/4-GPU/HandsOn/C/task6/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..0486c22d58d3d92d543b1778c7dcbd474eb0c470
--- /dev/null
+++ b/4-GPU/HandsOn/C/task6/poisson2d.c
@@ -0,0 +1,245 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map an existing device allocation (here from nvshmem_malloc) to a host
+// allocation so that OpenACC treats the host array as present on the device
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
+
+    // TODO: Get an nvshmem_ptr to the d_A allocation of the top and bottom PE
+    // using nvshmem_ptr(void* ptr, int pe)
+    // real * restrict d_Atop =
+    // real * restrict d_Abottom =
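+    // A hedged sketch of what the two commented lines above could look like (assumes the same
+    // wrap-around top/bottom PE arithmetic used later in the iteration loop; not the worked
+    // solution):
+    //   int top = (rank == 0) ? (size - 1) : rank - 1;
+    //   int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+    //   real *restrict d_Atop = (real *)nvshmem_ptr(d_A, top);
+    //   real *restrict d_Abottom = (real *)nvshmem_ptr(d_A, bottom);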
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
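+
+    // Worked example (assuming the defaults above, ny = 4096, and e.g. 6 ranks): chunk_size =
+    // ceil(4096 / 6) = 683, so rank 0 computes rows 1..682, rank 1 rows 683..1365, and the last
+    // rank is clipped so that its last processed row is ny - 2 = 4094.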
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    
+    // TODO: Measuring the MPI time with asynchronous compute is not well defined. Remove it here and below.
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    // TODO: Optional: Execute asynchronously where possible
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+// TODO: Check which parts of the while loop can be executed asynchronously by adding the async clause
+// You might also need to use wait
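+// A generic sketch of the pattern the TODO refers to (guidance only, not the worked solution):
+// launch with "#pragma acc parallel loop ... async" and synchronize with "#pragma acc wait"
+// before the results are consumed on the host, e.g. before the MPI_Allreduce on error below.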
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+        // TODO: If on the upper or lower boundary, also write directly into the top / bottom halo region.
+        // You need an acc deviceptr clause to use d_Atop and d_Abottom directly in the kernel
+        // (see the sketch after this loop nest).
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
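+        // A hedged sketch of the deviceptr variant the TODO above hints at (d_Atop / d_Abottom
+        // are the pointers from the earlier TODO; the checks for being the top-most or
+        // bottom-most rank are omitted):
+        //   #pragma acc parallel loop present(A, Anew) deviceptr(d_Atop, d_Abottom)
+        //   ... copy Anew into A and, on the first/last processed row, also store the value
+        //   directly into the neighbor's halo row via d_Atop / d_Abottom ...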
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        int iy_start_top = top * chunk_size;
+        int iy_end_top = iy_start_top + chunk_size;
+
+        // Do not process boundaries
+        iy_start_top = max(iy_start_top, 1);
+        iy_end_top = min(iy_end_top, ny - 1);
+
+        int iy_start_bottom = bottom * chunk_size;
+        int iy_end_bottom = iy_start_bottom + chunk_size;
+
+        // Do not process boundaries
+        iy_start_bottom = max(iy_start_bottom, 1);
+        iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+        // Halo exchange
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+
+            // TODO: Remove the explicit put as this is no longer needed
+            nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),
+                               (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top);
+            nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+                               (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start),
+                               bottom);
+            // TODO: When using async, get the CUDA stream that corresponds to the OpenACC default async queue
+            // and replace the barrier with one on that stream, using nvshmemx_barrier_on_stream
+            nvshmem_barrier_all();
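+            // A hedged sketch of the stream-ordered variant of the barrier above
+            // (acc_get_cuda_stream is the OpenACC runtime routine; the exact NVSHMEM
+            // barrier-on-stream name and signature should be checked against the installed
+            // nvshmemx.h):
+            //   cudaStream_t stream = (cudaStream_t)acc_get_cuda_stream(acc_async_noval);
+            //   nvshmemx_barrier_all_on_stream(stream);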
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/C/task6/poisson2d_serial.c b/4-GPU/HandsOn/C/task6/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a1f2ca159e1af0361f194c39e7231a1e0aa19
--- /dev/null
+++ b/4-GPU/HandsOn/C/task6/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/FORTRAN/task0/Makefile b/4-GPU/HandsOn/FORTRAN/task0/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e00f4b787321ba191f3e6565a066863e0f8812bb
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task0/Makefile
@@ -0,0 +1,53 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = pgfortran
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	FCFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=0
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.pgprof poisson2d.pgprof
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF}  -f -o ${SC19_DIR_SCRATCH}/poisson2d.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.pgprof .
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof .
\ No newline at end of file
diff --git a/4-GPU/HandsOn/FORTRAN/task0/poisson2d.F03 b/4-GPU/HandsOn/FORTRAN/task0/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..2ba3475ba8bfb1b5bbb9e85c681d8f8b4715f5cb
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task0/poisson2d.F03
@@ -0,0 +1,149 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 2048
+    INTEGER, PARAMETER :: ny = 2048
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, ierror
+    REAL :: x,y, error
+    REAL*8 :: runtime_cpu, runtime, start, finish
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 500
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    iy_start = 2
+    iy_end = ny-1
+    
+    !OpenACC Warm-up
+    !$acc parallel loop
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+        END DO
+    END DO
+    !$acc end parallel
+        
+
+    WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+    WRITE(*,*) 'Calculate reference solution and time serial CPU execution.'
+    call cpu_time(start)
+    CALL poisson2d_serial( nx, ny, iter_max, tol, a_ref, a_new, rhs )
+    call cpu_time(finish)
+    runtime_cpu = finish-start
+    
+    WRITE(*,*) 'GPU execution.'
+    
+    call cpu_time(start)
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !TODO: Parallelize loop nest with OpenACC
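+        !A minimal sketch of what is expected here and for the loop nests below (the managed
+        !memory flag in the Makefile handles data movement): wrap each nest in
+        !  !$acc parallel loop
+        !  ...
+        !  !$acc end parallel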
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        
+        
+        !TODO: Parallelize loop nest with OpenACC
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        
+        
+        !TODO: Parallelize loop nest with OpenACC
+        DO ix = ix_start, ix_end
+            a(ix,1) = a(ix,ny-1)
+            a(ix,ny) = a(ix,2)
+        END DO
+        
+        !TODO: Parallelize loop nest with OpenACC
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        
+
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    call cpu_time(finish)
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        WRITE(*,"(I4,'x',I4,': 1 CPU: ',F8.4,' s 1 GPU: ',F8.4,' s, speedup: ',F8.2)"), &
+              nx,ny,runtime_cpu,runtime,runtime_cpu/runtime
+    ELSE
+        errors = .TRUE.
+    END IF
+
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/FORTRAN/task0/poisson2d_serial.F03 b/4-GPU/HandsOn/FORTRAN/task0/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..da5f37d95e2032ff21053c0f80f934a9685b8a23
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task0/poisson2d_serial.F03
@@ -0,0 +1,106 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max ,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    ix = ix_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/FORTRAN/task1/Makefile b/4-GPU/HandsOn/FORTRAN/task1/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2bfb79cdedc68ae33659e1149f62aad6f2741a52
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task1/Makefile
@@ -0,0 +1,62 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = pgfortran
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	FCFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=1
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.*.pgprof poisson2d.*.pgprof *.tar.gz *.csv
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.timeline.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF} --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.metrics.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof ./poisson2d 3
+	pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.efficiency.csv
+	mv ${SC19_DIR_SCRATCH}/poisson2d.*.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.timeline.pgprof poisson2d.metrics.pgprof
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+		${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.timeline.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.solution.metrics.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3
+		pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.solution.efficiency.csv
+		mv ${SC19_DIR_SCRATCH}/poisson2d.solution.*.pgprof .
+		tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.*.pgprof
+		
\ No newline at end of file
diff --git a/4-GPU/HandsOn/FORTRAN/task1/poisson2d.F03 b/4-GPU/HandsOn/FORTRAN/task1/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..a88858fb8c00fd8eab7d4b65e047c65bc7f3da98
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task1/poisson2d.F03
@@ -0,0 +1,150 @@
+! Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 2048
+    INTEGER, PARAMETER :: ny = 2048
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, ierror
+    REAL :: x,y, error
+    REAL*8 :: runtime_cpu, runtime, start, finish
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 500
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    iy_start = 2
+    iy_end = ny-1
+    
+    !OpenACC Warm-up
+    !$acc parallel loop
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+        END DO
+    END DO
+    !$acc end parallel
+        
+
+    WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+    WRITE(*,*) 'Calculate reference solution and time serial CPU execution.'
+    call cpu_time(start)
+    CALL poisson2d_serial( nx, ny, iter_max, tol, a_ref, a_new, rhs )
+    call cpu_time(finish)
+    runtime_cpu = finish-start
+    
+    WRITE(*,*) 'GPU execution.'
+    
+    call cpu_time(start)
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop
+        DO ix = ix_start, ix_end
+            DO iy = iy_start, iy_end
+                !TODO: Fix memory access pattern (see the note after this loop nest)
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
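+        !Note on the TODO above: Fortran arrays are column-major, so the first index (ix) should
+        !be the innermost loop for contiguous accesses. A hedged sketch of the fix:
+        !  DO iy = iy_start, iy_end
+        !      DO ix = ix_start, ix_end
+        !          a_new(ix,iy) = ...
+        !      END DO
+        !  END DO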
+        
+        !$acc parallel loop
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO ix = ix_start, ix_end
+            a(ix,1) = a(ix,ny-1)
+            a(ix,ny) = a(ix,2)
+        END DO
+        !$acc end parallel
+        !$acc parallel loop
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        !$acc end parallel
+
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    call cpu_time(finish)
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        WRITE(*,"(I4,'x',I4,': 1 CPU: ',F8.4,' s 1 GPU: ',F8.4,' s, speedup: ',F8.2)"), &
+              nx,ny,runtime_cpu,runtime,runtime_cpu/runtime
+    ELSE
+        errors = .TRUE.
+    END IF
+
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/FORTRAN/task1/poisson2d_serial.F03 b/4-GPU/HandsOn/FORTRAN/task1/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..da5f37d95e2032ff21053c0f80f934a9685b8a23
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task1/poisson2d_serial.F03
@@ -0,0 +1,106 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max ,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    ix = ix_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/FORTRAN/task2/Makefile b/4-GPU/HandsOn/FORTRAN/task2/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..46380b18350a61f0fcc8297db6a6c423f99343ee
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task2/Makefile
@@ -0,0 +1,54 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = mpifort
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --annotate-mpi openmpi
+
+TASK=2
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof 
\ No newline at end of file
diff --git a/4-GPU/HandsOn/FORTRAN/task2/poisson2d.F03 b/4-GPU/HandsOn/FORTRAN/task2/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..c5f5ae58e54e78da0cc2c39d338f1d0eeadc91ca
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task2/poisson2d.F03
@@ -0,0 +1,239 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+#if _OPENACC
+    USE openacc
+#endif
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 4096
+    INTEGER, PARAMETER :: ny = 4096
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, mpi_rank, mpi_size, device_type, ngpus, devicenum, ierror
+    INTEGER :: chunk_size, right, left
+    REAL :: x,y, error, globalerror
+    REAL*8 :: runtime_serial, runtime, start, finish, mpi_time, mpi_start_time
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 1000
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    mpi_rank = 0
+    mpi_size = 1
+    
+    !Initialize MPI and determine rank and size
+    CALL MPI_Init(ierror)
+    CALL MPI_Comm_rank(MPI_COMM_WORLD,mpi_rank,ierror)
+    CALL MPI_Comm_size(MPI_COMM_WORLD,mpi_size,ierror)
+    
+    !TODO: handle device affinity
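+    !A hedged sketch of one way to do this, using the OpenACC runtime routines suggested by the
+    !declarations above (assumes every rank on a node sees all of that node's GPUs):
+    !  ngpus = acc_get_num_devices( acc_device_nvidia )
+    !  devicenum = MOD( mpi_rank, ngpus )
+    !  CALL acc_set_device_num( devicenum, acc_device_nvidia )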
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    !$acc enter data create(a,a_ref,a_new,rhs)
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    !TODO: set first and last row to be processed by this rank.
+    iy_start = 2
+    iy_end = ny-1
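+    !A hedged sketch of one possible decomposition, mirroring the C tasks (chunk_size is
+    !declared above; the global boundary rows stay untouched):
+    !  chunk_size = CEILING( REAL(ny) / REAL(mpi_size) )
+    !  iy_start = MAX( mpi_rank*chunk_size + 1, 2 )
+    !  iy_end   = MIN( (mpi_rank+1)*chunk_size, ny-1 )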
+    
+    !OpenACC Warm-up
+    !$acc parallel loop present(a,a_ref)
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+            a_ref(ix,iy) = 0.0
+        END DO
+    END DO
+    
+        
+    !MPI Warm-up to establish CUDA IPC connections
+    DO i = 1,2
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        !$acc host_data use_device( a )
+            !1. Send column iy_start (first modified column) to the left, receive the right boundary (iy_end+1) from the right
+            CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+
+            !2. Send column iy_end (last modified column) to the right, receive the left boundary (iy_start-1) from the left
+            CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !$acc end host_data
+    END DO
+    
+    !Wait for all processes to finish Warm-up
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+        WRITE(*,*) 'Calculate reference solution and time serial execution.'
+        !Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        start = MPI_WTIME()
+        CALL poisson2d_serial( nx, ny, iter_max, mpi_rank, tol, a_ref, a_new, rhs )
+        finish = MPI_WTIME()
+        runtime_serial = finish-start
+    END IF
+    CALL MPI_Bcast(a_ref, size(a_ref), MPI_REAL_TYPE, 0, MPI_COMM_WORLD, ierror)
+    
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,*) 'Parallel execution.'
+    END IF 
+    
+    mpi_time = 0.0
+    start = MPI_WTIME()
+    iter = 1
+    error = 1.0
+    !$acc update device(a(1:nx,iy_start:iy_end),rhs(1:nx,iy_start:iy_end))
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop present(a,a_new,rhs)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        !Calculate global error across all ranks
+        globalerror = 0.0
+        call MPI_Allreduce( error, globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD, ierror )
+        error = globalerror
+        
+        !$acc parallel loop present(a,a_new)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !TODO: Handle periodic boundary conditions and halo exchange with MPI
+        !$acc parallel loop
+        DO ix = ix_start, ix_end
+            a(ix,iy_start-1) = a(ix,iy_end)
+            a(ix,iy_end+1) = a(ix,iy_start)
+        END DO
+        !$acc end parallel
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        
+        mpi_start_time = MPI_WTIME()
+        !TODO: Pass device ptr of A to MPI using host_data use_device
+            !TODO: 1. Send column iy_start (first modified column) to the left, receive the right boundary (iy_end+1) from the right
+            ! CALL MPI_SENDRECV(SENDBUF, SENDCOUNT, MPI_REAL_TYPE, DEST, 0, RECVBUF, RECVCOUNT, MPI_REAL_TYPE, SOURCE, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE, ierror)
+
+            !TODO: 2. Send column iy_end (last modified column) to the right, receive the left boundary (iy_start-1) from the left
+            ! CALL MPI_SENDRECV(SENDBUF, SENDCOUNT, MPI_REAL_TYPE, DEST, 0, RECVBUF, RECVCOUNT, MPI_REAL_TYPE, SOURCE, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE, ierror)
+        !TODO: !$acc end host_data
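+        !A hedged sketch of the exchange, mirroring the warm-up block near the top of this file:
+        !  !$acc host_data use_device( a )
+        !      CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left, 0, &
+        !                         a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+        !                         MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !      CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+        !                         a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left, 0, &
+        !                         MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !  !$acc end host_data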
+        mpi_time = (MPI_WTIME() - mpi_start_time) + mpi_time
+        
+        !$acc parallel loop present(a)
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        !$acc end parallel
+
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a(1:nx,iy_start:iy_end))
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    finish = MPI_WTIME()
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        IF ( mpi_rank == 0 ) THEN
+            WRITE(*,*) 'Num GPUs: ', mpi_size
+            WRITE(*,"(I4,'x',I4,': 1 GPU: ',F8.4,' s ',I1,' GPUs: ',F8.4,' s, speedup: ',F8.2,' efficiency: ',F8.2)"), &
+                  nx,ny,runtime_serial,mpi_size,runtime,runtime_serial/runtime,runtime_serial/(mpi_size*runtime)*100
+            WRITE(*,"('MPI time: 'F8.4' s, inter GPU BW: 'F8.2' GiB/s')"), &
+                  mpi_time,(iter*4*(ix_end-ix_start)*SIZEOF(a(1,1)))/(1024*1024*1024*mpi_time)
+        END IF
+    ELSE
+        errors = .TRUE.
+    END IF
+    
+    !$acc exit data delete(a,a_ref,a_new,rhs)
+    CALL MPI_Finalize(ierror)
+    
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/FORTRAN/task2/poisson2d_serial.F03 b/4-GPU/HandsOn/FORTRAN/task2/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..8a6e0a9f25deb2e6a615e3e1ba214f48d93a4ac1
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task2/poisson2d_serial.F03
@@ -0,0 +1,126 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max, mpi_rank,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max, mpi_rank
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
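+    !The arrays were already created on the device by the caller (enter data); copy the host-initialized values over before iterating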
+    !$acc data present(a_ref,rhs,a_new)
+    !$acc update device(a_ref,rhs)
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        !$acc end parallel
+        
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a_ref)
+    !$acc end data
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        !Reset ix for every row so that all rows, not just the first one, are checked
+        ix = ix_start
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('[MPI',I1,'] ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    mpi_rank,ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    
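+    !Combine the per-rank results: the check only passes globally if it passed on every rank (logical AND across MPI_COMM_WORLD)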
+    CALL MPI_Initialized(mpi_is_initialized, ierror)
+    IF ( mpi_is_initialized ) THEN
+        global_no_errors = .FALSE.
+        CALL MPI_ALLREDUCE(no_errors, global_no_errors, 1, MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, ierror)
+        no_errors = global_no_errors
+    END IF
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
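+    !Lookup table mapping the total number of ranks (1-16) to a 2D rank grid; the first value of each pair becomes mpi_sizey, the second mpi_sizex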
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/FORTRAN/task3/Makefile b/4-GPU/HandsOn/FORTRAN/task3/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..62dcaf89c710b1f2839a4088d87f7ef7df4f9311
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task3/Makefile
@@ -0,0 +1,54 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = mpifort
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
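+# Both branches promote the default REAL kind to 8 bytes and define MPI_REAL_TYPE as the matching MPI datatype (MPI_REAL8)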
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --annotate-mpi openmpi
+
+TASK=3
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof 
\ No newline at end of file
diff --git a/4-GPU/HandsOn/FORTRAN/task3/poisson2d.F03 b/4-GPU/HandsOn/FORTRAN/task3/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..d341dd7328c8f8cdcbe94c0f4e47b2cfaf3aef10
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task3/poisson2d.F03
@@ -0,0 +1,254 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+#if _OPENACC
+    USE openacc
+#endif
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 4096
+    INTEGER, PARAMETER :: ny = 4096
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i, ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max
+    INTEGER :: mpi_rank, mpi_size, device_type, ngpus, devicenum, ierror
+    INTEGER :: chunk_size, right, left
+    REAL :: x,y, error, globalerror
+    REAL*8 :: runtime_serial, runtime, start, finish, mpi_time, mpi_start_time
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 1000
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    mpi_rank = 0
+    mpi_size = 1
+    
+    !Initialize MPI and determine rank and size
+    CALL MPI_Init(ierror)
+    CALL MPI_Comm_rank(MPI_COMM_WORLD,mpi_rank,ierror)
+    CALL MPI_Comm_size(MPI_COMM_WORLD,mpi_size,ierror)
+    
+#if _OPENACC
+    device_type = acc_get_device_type()
+    IF ( acc_device_nvidia == device_type ) THEN
+        ngpus=acc_get_num_devices( acc_device_nvidia )
+        !choose device to use by this rank
+        devicenum = MOD( mpi_rank, ngpus )
+        call acc_set_device_num( devicenum, acc_device_nvidia )
+    END IF
+    call acc_init( device_type )
+#endif
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    !$acc enter data create(a,a_ref,a_new,rhs)
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    !set first and last row to be processed by this rank.
+    !Ensure correctness if ny%size != 0
+    chunk_size = CEILING( (1.0*ny)/mpi_size )
+    iy_start = mpi_rank * chunk_size
+    iy_end = iy_start + chunk_size - 1
+    
+    !Do not process boundaries
+    iy_start = MAX( iy_start, 2 )
+    iy_end = MIN( iy_end, ny-1 )
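+    !Example: ny=4096 on 6 ranks gives chunk_size=683, so rank 0 handles rows 2..682 and rank 5 rows 3415..4095 after clamping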
+    
+    !OpenACC Warm-up
+    !$acc parallel loop present(a,a_ref)
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+            a_ref(ix,iy) = 0.0
+        END DO
+    END DO
+    
+    !MPI Warm-up to establish CUDA IPC connections
+    DO i = 1,2
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        !$acc host_data use_device( a )
+            !1. Send column iy_start (first modified column) to the left neighbor and receive the right boundary (iy_end+1) from the right neighbor
+            CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                              a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                              MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+
+            !2. Send column iy_end (last modified column) to the right neighbor and receive the left boundary (iy_start-1) from the left neighbor
+            CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !$acc end host_data
+    END DO
+    
+    !Wait for all processes to finish Warm-up
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+        WRITE(*,*) 'Calculate reference solution and time serial execution.'
+        !Timing of MPI rank 0 is used to calculate the speedup, so do this in isolation
+        start = MPI_WTIME()
+        CALL poisson2d_serial( nx, ny, iter_max, mpi_rank, tol, a_ref, a_new, rhs )
+        finish = MPI_WTIME()
+        runtime_serial = finish-start
+    END IF
+    CALL MPI_Bcast(a_ref, size(a_ref), MPI_REAL_TYPE, 0, MPI_COMM_WORLD, ierror)
+    
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,*) 'Parallel execution.'
+    END IF 
+    
+    mpi_time = 0.0
+    start = MPI_WTIME()
+    iter = 1
+    error = 1.0
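+    !Copy only the rows owned by this rank (and the matching part of rhs) to the device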
+    !$acc update device(a(1:nx,iy_start:iy_end),rhs(1:nx,iy_start:iy_end))
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop present(a,a_new,rhs)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        !Calculate global error across all ranks
+        globalerror = 0.0
+        call MPI_Allreduce( error, globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD, ierror )
+        error = globalerror
+        
+        !TODO: Split into halo and bulk part
+        !$acc parallel loop present(a,a_new)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        !TODO: Start bulk part asynchronously
+        
+        !Handle periodic boundary conditions and halo exchange with MPI
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        
+        mpi_start_time = MPI_WTIME()
+        !$acc host_data use_device( a )
+            !1. Send column iy_start (first modified column) to the left neighbor and receive the right boundary (iy_end+1) from the right neighbor
+            CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+
+            !2. Send column iy_end (last modified column) to the right neighbor and receive the left boundary (iy_start-1) from the left neighbor
+            CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !$acc end host_data
+        mpi_time = (MPI_WTIME() - mpi_start_time) + mpi_time
+        !TODO: wait for bulk part
+        
+        !$acc parallel loop present(a)
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        !$acc end parallel
+
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a(1:nx,iy_start:iy_end))
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    finish = MPI_WTIME()
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        IF ( mpi_rank == 0 ) THEN
+            WRITE(*,*) 'Num GPUs: ', mpi_size
+            WRITE(*,"(I4,'x',I4,': 1 GPU: ',F8.4,' s ',I1,' GPUs: ',F8.4,' s, speedup: ',F8.2,' efficiency: ',F8.2)") &
+                  nx,ny,runtime_serial,mpi_size,runtime,runtime_serial/runtime,runtime_serial/(mpi_size*runtime)*100
+            WRITE(*,"('MPI time: ',F8.4,' s, inter GPU BW: ',F8.2,' GiB/s')") &
+                  mpi_time,(iter*4*(ix_end-ix_start)*SIZEOF(a(1,1)))/(1024*1024*1024*mpi_time)
+        END IF
+    ELSE
+        errors = .TRUE.
+    END IF
+    
+    !$acc exit data delete(a,a_ref,a_new,rhs)
+    CALL MPI_Finalize(ierror)
+    
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/FORTRAN/task3/poisson2d_serial.F03 b/4-GPU/HandsOn/FORTRAN/task3/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..8a6e0a9f25deb2e6a615e3e1ba214f48d93a4ac1
--- /dev/null
+++ b/4-GPU/HandsOn/FORTRAN/task3/poisson2d_serial.F03
@@ -0,0 +1,126 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max, mpi_rank,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max, mpi_rank
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
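+    !The arrays were already created on the device by the caller (enter data); copy the host-initialized values over before iterating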
+    !$acc data present(a_ref,rhs,a_new)
+    !$acc update device(a_ref,rhs)
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        !$acc end parallel
+        
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a_ref)
+    !$acc end data
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        !Reset ix for every row so that all rows, not just the first one, are checked
+        ix = ix_start
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('[MPI',I1,'] ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    mpi_rank,ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    
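+    !Combine the per-rank results: the check only passes globally if it passed on every rank (logical AND across MPI_COMM_WORLD)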
+    CALL MPI_Initialized(mpi_is_initialized, ierror)
+    IF ( mpi_is_initialized ) THEN
+        global_no_errors = .FALSE.
+        CALL MPI_ALLREDUCE(no_errors, global_no_errors, 1, MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, ierror)
+        no_errors = global_no_errors
+    END IF
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
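+    !Lookup table mapping the total number of ranks (1-16) to a 2D rank grid; the first value of each pair becomes mpi_sizey, the second mpi_sizex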
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/HandsOnGPUProgramming_Task.html b/4-GPU/HandsOn/HandsOnGPUProgramming.html
similarity index 90%
rename from 4-GPU/HandsOn/HandsOnGPUProgramming_Task.html
rename to 4-GPU/HandsOn/HandsOnGPUProgramming.html
index 51cfb7eb1265cf970c616f7d237b2805ece09341..50711ae98ecb3e5a96d001a8e5c3c70d609a8d3f 100644
--- a/4-GPU/HandsOn/HandsOnGPUProgramming_Task.html
+++ b/4-GPU/HandsOn/HandsOnGPUProgramming.html
@@ -2,7 +2,7 @@
 <html>
 <head><meta charset="utf-8" />
 
-<title>HandsOnGPUProgramming_Tasks</title>
+<title>HandsOnGPUProgramming</title>
 
 <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
@@ -13017,45 +13017,6 @@ ul.typeahead-list  > li > a.pull-right {
 .highlight .vm { color: #19177C } /* Name.Variable.Magic */
 .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
     </style>
-<style type="text/css">
-    
-/* Temporary definitions which will become obsolete with Notebook release 5.0 */
-.ansi-black-fg { color: #3E424D; }
-.ansi-black-bg { background-color: #3E424D; }
-.ansi-black-intense-fg { color: #282C36; }
-.ansi-black-intense-bg { background-color: #282C36; }
-.ansi-red-fg { color: #E75C58; }
-.ansi-red-bg { background-color: #E75C58; }
-.ansi-red-intense-fg { color: #B22B31; }
-.ansi-red-intense-bg { background-color: #B22B31; }
-.ansi-green-fg { color: #00A250; }
-.ansi-green-bg { background-color: #00A250; }
-.ansi-green-intense-fg { color: #007427; }
-.ansi-green-intense-bg { background-color: #007427; }
-.ansi-yellow-fg { color: #DDB62B; }
-.ansi-yellow-bg { background-color: #DDB62B; }
-.ansi-yellow-intense-fg { color: #B27D12; }
-.ansi-yellow-intense-bg { background-color: #B27D12; }
-.ansi-blue-fg { color: #208FFB; }
-.ansi-blue-bg { background-color: #208FFB; }
-.ansi-blue-intense-fg { color: #0065CA; }
-.ansi-blue-intense-bg { background-color: #0065CA; }
-.ansi-magenta-fg { color: #D160C4; }
-.ansi-magenta-bg { background-color: #D160C4; }
-.ansi-magenta-intense-fg { color: #A03196; }
-.ansi-magenta-intense-bg { background-color: #A03196; }
-.ansi-cyan-fg { color: #60C6C8; }
-.ansi-cyan-bg { background-color: #60C6C8; }
-.ansi-cyan-intense-fg { color: #258F8F; }
-.ansi-cyan-intense-bg { background-color: #258F8F; }
-.ansi-white-fg { color: #C5C1B4; }
-.ansi-white-bg { background-color: #C5C1B4; }
-.ansi-white-intense-fg { color: #A1A6B2; }
-.ansi-white-intense-bg { background-color: #A1A6B2; }
-
-.ansi-bold { font-weight: bold; }
-
-    </style>
 
 
 <style type="text/css">
@@ -13089,7 +13050,7 @@ div#notebook {
 
 <!-- Loading mathjax macro -->
 <!-- Load mathjax -->
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS_HTML"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS_HTML"></script>
     <!-- MathJax configuration -->
     <script type="text/x-mathjax-config">
     MathJax.Hub.Config({
@@ -13116,7 +13077,7 @@ div#notebook {
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Hands-On-GPU-Programming">Hands-On GPU Programming<a class="anchor-link" href="#Hands-On-GPU-Programming">&#182;</a></h1><p><em>Supercomputing 2018 Tutorial "Application Porting and Optimization on GPU-Accelerated POWER Architectures", November 12th 2018</em></p>
+<h1 id="Hands-On-GPU-Programming">Hands-On GPU Programming<a class="anchor-link" href="#Hands-On-GPU-Programming">&#182;</a></h1><p><em>Supercomputing 2019 Tutorial "Application Porting and Optimization on GPU-Accelerated POWER Architectures", November 18th 2019</em></p>
 <hr>
 
 </div>
@@ -13126,10 +13087,11 @@ div#notebook {
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <h3 id="Read-me-first">Read me first<a class="anchor-link" href="#Read-me-first">&#182;</a></h3><p>This tutorial is primarily designed to be executed as a <em>jupyter</em> notebook. However, everything can also be done using an <em>ssh</em> connection to <em>ascent.olcf.ornl.gov</em> in your terminal.</p>
-<h4 id="Jupyter-notebook-execution">Jupyter notebook execution<a class="anchor-link" href="#Jupyter-notebook-execution">&#182;</a></h4><p>When using jupyter this notebook will guide you through the step. Note that if you execute a cell multiple times while optimizing the code the output will be replaced. You can however duplicate the cell you want to execute and keep its output. Check the <em>edit</em> menu above.</p>
+<h4 id="Jupyter-Lab-execution">Jupyter Lab execution<a class="anchor-link" href="#Jupyter-Lab-execution">&#182;</a></h4><p>When using Jupyter Lab, this notebook will guide you through the steps. Note that if you execute a cell multiple times while optimizing the code, the output will be replaced. You can, however, duplicate the cell you want to execute and keep its output. Check the <em>edit</em> menu above.</p>
 <p>You will always find links to a file browser of the corresponding task subdirectory as well as direct links to the source files you will need to edit as well as the profiling output you need to open locally.</p>
-<p>If you want you also can get a <a href="/terminals/4">terminal</a> in your browser.</p>
-<h4 id="Terminal-fallback">Terminal fallback<a class="anchor-link" href="#Terminal-fallback">&#182;</a></h4><p>The tasks are placed in directories named <code>[C/FORTRAN]/task[0-3]</code>.</p>
+<p>If you want, you can also get a terminal in your browser by following <em>File -&gt; New -&gt; Terminal</em> in the Jupyter Lab menu bar.</p>
+<h4 id="Terminal-fallback">Terminal fallback<a class="anchor-link" href="#Terminal-fallback">&#182;</a></h4><p>The tasks are placed in directories named <code>[C/FORTRAN]/task[0-6]</code>.<br>
+<em>Note: The tasks using NVSHMEM (4-6) are only available in C.</em></p>
 <p>The files you will need to edit are always the <code>poisson2d.(C|F03)</code> files.</p>
 <p>The makefile targets execute everything to compile, run and profile the code. Please take a look at the cells containing the make calls as a guide.</p>
 <p>The outputs of profiling runs be placed in the working directory of the current task and are named like <code>*.pgprof</code> or <code>pgprof.*.tar.gz</code> in case of multiple files. You can use <em>scp/sftp</em> to transfer files to your machine and for viewing them in pgprof/nvprof.</p>
@@ -13174,6 +13136,7 @@ div#notebook {
 <span class="k">if</span><span class="p">(</span><span class="ow">not</span> <span class="n">rootdir</span><span class="p">):</span>
     <span class="n">rootdir</span><span class="o">=%</span><span class="k">pwd</span>
 <span class="n">basedir</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">rootdir</span><span class="p">,</span><span class="n">LANGUAGE</span><span class="p">)</span>
+<span class="n">basedirC</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">rootdir</span><span class="p">,</span><span class="s1">&#39;C&#39;</span><span class="p">)</span>
 
 <span class="nb">print</span> <span class="p">(</span><span class="s2">&quot;You selected </span><span class="si">{}</span><span class="s2"> for the exercises.&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">LANGUAGE</span><span class="p">))</span>
 
@@ -13187,6 +13150,8 @@ div#notebook {
         <span class="n">d</span><span class="o">=</span><span class="s1">&#39;</span><span class="si">%s</span><span class="s1">/task</span><span class="si">%i</span><span class="s1">&#39;</span><span class="o">%</span><span class="p">(</span><span class="n">basedir</span><span class="p">,</span><span class="n">t</span><span class="p">)</span>
         <span class="o">%</span><span class="k">cd</span> $d
         <span class="o">!</span>make clean
+        
+<span class="c1">#cleanall()</span>
 </pre></div>
 
     </div>
@@ -13198,25 +13163,36 @@ div#notebook {
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <h1 id="Tasks">Tasks<a name="top" /><a class="anchor-link" href="#Tasks">&#182;</a></h1><p>This session comes with multiple tasks. All tasks are available in C or FORTRAN and can be found in the <code>[C|Fortan]/task[0-3]</code> subdirectories. There you will also find Makefiles that are set up so that you can compile and submit all necessary tasks.</p>
-<p>Please choose from the task below.</p>
-<ul>
-<li><p><a href="#task0">Task 0</a> Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using <code>–ta=tesla:managed</code><br>
-<a href="#solution0">Solution 0</a></p>
+<p>Please choose from the tasks below. <em>If you want to go for the advanced NVSHMEM tasks, you should complete Task 2 but can skip Task 3 (or postpone it until the end).</em></p>
+<h3 id="GPU-Programming">GPU Programming<a class="anchor-link" href="#GPU-Programming">&#182;</a></h3><ul>
+<li><p><a href="#task0">Task 0</a> Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using <code>–ta=tesla:managed</code></p>
 </li>
-<li><p><a href="#task1">Task 1</a> Fix memory access pattern of OpenACC accelerated Jacobi Solver<br>
-<a href="#solution1">Solution 1</a></p>
+<li><p><a href="#task1">Task 1</a> Fix memory access pattern of OpenACC accelerated Jacobi Solver</p>
 </li>
-<li><p><a href="#task2">Task 2</a> Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs<br>
-<a href="#solution2">Solution 2</a></p>
+</ul>
+<h3 id="Multi-GPU-with-MPI">Multi-GPU with MPI<a class="anchor-link" href="#Multi-GPU-with-MPI">&#182;</a></h3><ul>
+<li><p><a href="#task2">Task 2</a> Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs</p>
 </li>
 <li><p><a href="#task3">Task 3</a> Hide MPI communication time by overlapping communication and 
-  computation in a MPI+OpenACC multi GPU Jacobi Solver<br>
-<a href="#solution3">Solution 3</a></p>
+  computation in a MPI+OpenACC multi GPU Jacobi Solver</p>
 </li>
 </ul>
-<ul>
+<h3 id="Multi-GPU-with-NVSHMEM-(Advanced----C-only)">Multi-GPU with NVSHMEM <em>(Advanced -- C only)</em><a class="anchor-link" href="#Multi-GPU-with-NVSHMEM-(Advanced----C-only)">&#182;</a></h3><ul>
+<li><p><a href="#task4">Task 4</a> Use NVSHMEM instead of MPI</p>
+</li>
+<li><p><a href="#task5">Task 5</a> Put NVSHMEM calls on stream to hide API calls and GPU/CPU synchronization</p>
+</li>
+</ul>
+<h3 id="Survey">Survey<a class="anchor-link" href="#Survey">&#182;</a></h3><ul>
 <li><a href="#survey">Suvery</a> Please remember to take the survey !</li>
 </ul>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
 <h3 id="Make-Targets-">Make Targets <a name="make" /><a class="anchor-link" href="#Make-Targets-">&#182;</a></h3><p>For all tasks we have defined the following make targets.</p>
 <ul>
 <li><strong>run</strong>:<br>
@@ -13254,13 +13230,8 @@ same as above for the solution (e.g. <code>make poisson2d.solution</code> or <co
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><p>You can open the source code either in a terminal in an editor. Navigate to <code>(C|Fortran)/task0/</code> and open <code>poisson2d.c</code> in a editor of your choice.</p>
 <p>If your are using the jupyter approach by following the link (for the language of your choice), This will open the source code in an editor in a new browser tab/window.</p>
 <ul>
-<li><a href="/edit/C/task0/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTAN/task0/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task0/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task0/">Fortran Version</a></li>
+<li><a href="./C/task0/poisson2d.c">C Version</a></li>
+<li><a href="./FORTRAN/task0/poisson2d.F03">Fortran Version</a></li>
 </ul>
 <p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
 
@@ -13320,7 +13291,7 @@ Alternatively you can just navigate to the right directory and execute <code>mak
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output file <code>poisson2d.pgprof</code>  can be downloaded from here: <a href="/tree/C/task0/poisson2d.pgprof?download=1">C Version</a> / <a href="/tree/FORTRAN/task0/poisson2d.pgprof?download=1">Fortran Version</a>.
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling has finished, the output file <code>poisson2d.pgprof</code> can be downloaded using the file browser.
 Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu.</p>
 
 </div>
@@ -13347,13 +13318,6 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 <li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
 <li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
 </ol>
-
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
 <p><a href="#top">Back to Top</a></p>
 <hr>
 
@@ -13367,13 +13331,8 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 There use “Global Memory Access Pattern” experiment to analyze the issue.</p>
 <p><em>Look for</em> <strong>TODOs</strong> in the code.</p>
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task1/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task1/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="Directory-browser">Directory browser<a class="anchor-link" href="#Directory-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task1/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task1/">Fortran Version</a></li>
+<li><a href="./C/task1/poisson2d.c">C Version</a></li>
+<li><a href="./FORTRAN/task1/poisson2d.F03">Fortran Version</a></li>
 </ul>
 <p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
 
@@ -13433,24 +13392,11 @@ Alternatively you can just navigate to the right directory and execute <code>mak
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output files can be downloaded from here: <a href="/tree/C/task1/pgprof.poisson2d.Task1.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task1/pgprof.Task1.poisson2d.tar.gz?download=1">Fortran Version</a>.
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. Download the tarball containing the profiles (<code>pgprof.Task1.poisson2d.tar.gz</code>) with the File Browser.
 Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu.</p>
 
 </div>
 </div>
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>ls
-</pre></div>
-
-    </div>
-</div>
-</div>
-
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
@@ -13499,13 +13445,6 @@ If you purely work in a terminal you can view the same output by running <code>p
 <li><a href="https://www.pgroup.com/resources/pgprof-quickstart.htm">pgprof Quickstart</a></li>
 <li><a href="https://docs.nvidia.com/cuda/profiler-users-guide/index.html">CUDA Toolkit Documentation - Profiler</a> <em>pgprof is based on the NVIDIA Visual Profiler</em></li>
 </ol>
-
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
 <p><a href="#top">Back to Top</a></p>
 <hr>
 
@@ -13523,13 +13462,8 @@ If you purely work in a terminal you can view the same output by running <code>p
 <p><em>Look for</em> <strong>TODOs</strong></p>
 <p>When profiling take a look at how kernel and communication times change when you scale to more GPUs.</p>
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task2/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task2/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task2/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task2/">Fortran Version</a></li>
+<li><a href="./C/task2/poisson2d.c">C Version</a></li>
+<li><a href="./FORTRAN/task2/poisson2d.F03">Fortran Version</a></li>
 </ul>
 <p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
 
@@ -13628,7 +13562,7 @@ Alternatively you can just navigate to the right directory and execute <code>mak
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output files can be downloaded from here: <a href="/tree/C/task2/pgprof.poisson2d.Task2.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task2/pgprof.poisson2d.Task2.tar.gz?download=1">Fortran Version</a>.
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling has completed, download the tarball containing the profiles (<code>pgprof.Task2.poisson2d.tar.gz</code>) with the File Browser. 
 Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
 
 </div>
@@ -13681,13 +13615,8 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 <p>Compare the scaling and efficiency with the results from the previous task. Check for the overlap in the profiler.</p>
 <p><em>Optional</em>: Try to understand how well communication and compute overlap is able to improve efficiency when scaling to more GPUs.</p>
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task3/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task3/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task3/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task3/">Fortran Version</a></li>
+<li><a href="./C/task3/poisson2d.c">C Version</a></li>
+<li><a href="./FORTRAN/task3/poisson2d.F03">Fortran Version</a></li>
 </ul>
 <p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
 
@@ -13786,7 +13715,7 @@ Alternatively you can just navigate to the right directory and execute <code>mak
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output files can be downloaded from here: <a href="/tree/C/task3/pgprof.poisson2d.Task3.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task3/pgprof.poisson2d.Task3.tar.gz?download=1">Fortran Version</a>.
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling has completed, download the tarball containing the profiles (<code>pgprof.Task3.poisson2d.tar.gz</code>) with the File Browser. 
 Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
 
 </div>
@@ -13822,6 +13751,21 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
 <hr>
+<h1 id="Tasks-using-NVSHMEM">Tasks using NVSHMEM<a class="anchor-link" href="#Tasks-using-NVSHMEM">&#182;</a></h1><p><strong>The following tasks are using NVSHMEM instead of MPI. NVSHMEM is currently available as early access software. Please read the following carefully before starting these tasks.</strong></p>
+<ul>
+<li><em>NVSHMEM early access 0.3.2</em> is installed on Ascent. It is provided under the license in <a href="./LICENSE_NVSHMEM.md">LICENSE_NVSHMEM.md</a>.</li>
+<li>If you want to continue using the NVSHMEM early access version beyond this tutorial, you need to apply for early access at <a href="https://developer.nvidia.com/nvshmem">https://developer.nvidia.com/nvshmem</a></li>
+</ul>
+<hr>
+<p>NVSHMEM enables efficient communication among GPUs. It supports an API for direct communication among GPUs, either initiated by the CPU or by GPUs inside of compute kernels. Inside compute kernels, NVSHMEM also supports direct load/store accesses to remote memory over PCIe or NVLink. The ability to initiate communication from inside kernels eliminates GPU-host synchronization and associated overheads. It can also benefit from latency tolerance mechanisms available within GPUs. The tasks illustrate that progressing from an MPI-only app to an app that uses NVSHMEM can be straightforward.</p>
+<p><strong>NOTE</strong>: Covering all features of NVSHMEM, including communication calls inside kernels, is not easily possible through OpenACC and would also exceed the scope of this tutorial. However, the OpenACC examples should give you a basic introduction to NVSHMEM.</p>
+<p>You can check the developer guide and the other presentations for more details.</p>
+<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
+<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
+<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
+<li><a href="http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf">OpenSHMEM 1.4 Specification</a></li>
+<li><a href="./NVSHMEM-Developer-Guide.pdf">NVSHMEM 0.3 EA Developer Guide</a></li>
+</ol>
 <hr>
 
 </div>
@@ -13830,8 +13774,460 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Survey">Survey<a name="survey" /><a class="anchor-link" href="#Survey">&#182;</a></h1><p>Please rememeber to take some time and fill out the survey <a href="http://bit.ly/sc18-eval">http://bit.ly/sc18-eval</a>.
-<img src="eval.png" alt="eval.png"></p>
+<h2 id="Task-4:-Replace-MPI-with-host-side-NVSHMEM">Task 4: <a name="task4" />Replace MPI with host-side NVSHMEM<a class="anchor-link" href="#Task-4:-Replace-MPI-with-host-side-NVSHMEM">&#182;</a></h2><p>To replace MPI from <a href="#task2">Task 2</a> with NVSHMEM, you will need to connect the NVSHMEM library to MPI and replace all MPI communication calls related to the halo exchange with the corresponding NVSHMEM functions:</p>
+<ul>
+<li>Include NVSHMEM API header (<code>nvshmem.h</code>)</li>
+<li>Include NVSHMEM API extensions header (<code>nvshmemx.h</code>)</li>
+<li>Initialize NVSHMEM and connect to MPI (<code>nvshmemx_init_attr</code>)</li>
+<li>Allocate symmetric memory (<code>nvshmem_alloc</code>) for <code>A</code> on the device and use the OpenACC <code>map</code> function to tell OpenACC to use it as device memory for <code>A</code></li>
+<li>Replace <code>MPI_Sendrecv</code> calls with SHMEM calls (<code>nvshmem_double_put</code>)</li>
+<li>Insert NVSHMEM barriers to ensure correct execution (<code>nvshmem_barrier_all</code>)</li>
+</ul>
+<p><strong>For interoperability with OpenSHMEM, NVSHMEM can also be set up to prefix all NVSHMEM calls with <code>nv</code>. Please make sure to use these versions, e.g. use <code>nvshmem_barrier</code> instead of <code>shmem_barrier</code>. The developer guide mostly uses the unprefixed versions.</strong></p>
+<p><em>Look for</em> <strong>TODOs</strong>.</p>
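+<p><em>To make the steps above more concrete, here is a minimal, hedged sketch of the initialization and allocation part only; it is not the solution file. The attribute-based initialization follows the NVSHMEM developer guide, <code>nvshmem_alloc</code> is the name used in the task text (recent releases call it <code>nvshmem_malloc</code>), and interpreting the OpenACC <code>map</code> function as <code>acc_map_data</code> is an assumption.</em></p>
+<pre><code>/* sketch: bootstrap NVSHMEM on top of MPI and use a symmetric allocation for A */
+#include &lt;stdlib.h&gt;
+#include &lt;mpi.h&gt;
+#include &lt;openacc.h&gt;
+#include &lt;nvshmem.h&gt;
+#include &lt;nvshmemx.h&gt;
+
+int main(int argc, char **argv) {
+    MPI_Init(&amp;argc, &amp;argv);
+
+    /* connect NVSHMEM to the already initialized MPI communicator */
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &amp;mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &amp;attr);
+
+    /* symmetric device allocation for A; every PE allocates the same size */
+    size_t bytes = 4096 * 4096 * sizeof(double);
+    double *A   = (double *)malloc(bytes);        /* host copy             */
+    double *d_A = (double *)nvshmem_alloc(bytes); /* symmetric device copy */
+
+    /* let OpenACC use d_A as the device memory backing A */
+    acc_map_data(A, d_A, bytes);
+
+    /* ... nvshmem_double_put calls for the halo exchange would go here ... */
+    nvshmem_barrier_all();
+
+    acc_unmap_data(A);
+    nvshmem_free(d_A);
+    free(A);
+    nvshmem_finalize();
+    MPI_Finalize();
+    return 0;
+}
+</code></pre>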
+<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
+<li><a href="./C/task4/poisson2d.c">C Version</a></li>
+</ul>
+<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedirC/task4
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Compilation">Compilation<a class="anchor-link" href="#Compilation">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
+Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span>make poisson2d
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Running">Running<a class="anchor-link" href="#Running">&#182;</a></h4><p>For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable <code>NP</code>. On <em>Ascent</em> within a single node you can use up to 6 GPUs.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Scaling">Scaling<a class="anchor-link" href="#Scaling">&#182;</a></h4><p>You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run <span class="p">|</span> grep speedup &gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run <span class="p">|</span> grep speedup &gt;&gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="n">data_frame4</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+
+<span class="o">!</span>rm scale.out
+
+<span class="n">data_frame4b</span><span class="o">=</span><span class="n">data_frame4</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frame4b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling has completed, download the tarball containing the profiles (<code>pgprof.Task4.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
+<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
+<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
+<li><a href="http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf">OpenSHMEM 1.4 Specification</a></li>
+<li><a href="./NVSHMEM-Developer-Guide.pdf">NVSHMEM 0.3 EA Developer Guide</a></li>
+</ol>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h2 id="Task-5:-Make-communication-asynchronous">Task 5: <a name="task5" />Make communication asynchronous<a class="anchor-link" href="#Task-5:-Make-communication-asynchronous">&#182;</a></h2><p>NVSHMEM allows you to put communications in <em>CUDA streams / OpenACC async queues</em>. This allows the CPU already set up communication and kernel launches while the GPU is still communicationg, effectively hiding the time spend in API calls.</p>
+<p>To do this you need to:</p>
+<ul>
+<li>use the <code>async</code> and <code>wait</code> keywords in the OpenACC pragmas to execute the kernels asynchronously in the OpenACC default queue</li>
+<li>replace <code>nvshmem_double_put</code> calls with the <code>nvshmemx_double_put_on_stream</code> version.<br>
+Use <code>acc_get_cuda_stream</code> and <code>acc_get_default_async</code> to get the <code>cudaStream_t cudaStream</code> corresponding to the OpenACC default async queue.</li>
+<li>make sure to synchronize before copying the data back to the CPU (see the sketch below)</li>
+</ul>
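+<p>A minimal, hedged sketch of what the stream-based version could look like. The variable names (<code>top</code>, <code>iy_start</code>, <code>ix_start</code>, <code>ix_end</code>, <code>nx</code>) and the remote halo offset are assumptions for illustration and may differ from the actual TODO code:</p>
+<pre><code>// Hedged sketch, not the reference solution (assumes &lt;openacc.h&gt; and &lt;nvshmemx.h&gt; are included)
+cudaStream_t cudaStream = (cudaStream_t) acc_get_cuda_stream( acc_get_default_async() );
+
+#pragma acc parallel loop present(A, Anew) async  // run in the OpenACC default async queue
+for (int ix = ix_start; ix &lt; ix_end; ix++)
+    A[iy_start * nx + ix] = Anew[iy_start * nx + ix];
+
+#pragma acc host_data use_device(A)
+{
+    // put the first modified row into the top neighbor's halo row on the same stream
+    // (with this decomposition the remote halo row is assumed to have the same global index)
+    nvshmemx_double_put_on_stream(A + iy_start * nx + ix_start,  // remote destination (assumed offset)
+                                  A + iy_start * nx + ix_start,  // local boundary row
+                                  (ix_end - ix_start), top, cudaStream);
+}
+
+#pragma acc wait  // synchronize the default queue before the CPU touches A again
+</code></pre>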
+<p><em>Look for</em> <strong>TODOs</strong>.</p>
+<p>Compare the scaling and efficiency with the results from the previous task and the MPI versions. Check for asynchronous execution in the profiler.</p>
+<p><em>Optional</em>: Try to understand how well communication and compute overlap is able to improve efficiency when scaling to more GPUs.</p>
+<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
+<li><a href="./C/task5/poisson2d.c">C Version</a></li>
+</ul>
+<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedirC/task5
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Compilation">Compilation<a class="anchor-link" href="#Compilation">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
+Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
+<span class="o">!</span>make poisson2d
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Running">Running<a class="anchor-link" href="#Running">&#182;</a></h4><p>For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable <code>NP</code>. On <em>Ascent</em> within a single node you can use up to 6 GPUs.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Scaling">Scaling<a class="anchor-link" href="#Scaling">&#182;</a></h4><p>You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run <span class="p">|</span> grep speedup &gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run <span class="p">|</span> grep speedup &gt;&gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="n">data_frame5</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+
+<span class="o">!</span>rm scale.out
+
+<span class="n">data_frame5b</span><span class="o">=</span><span class="n">data_frame5</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frame5b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task5.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
+<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
+<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
+<li><a href="http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf">OpenSHMEM 1.4 Specification</a></li>
+<li><a href="./NVSHMEM-Developer-Guide.pdf">NVSHMEM 0.3 EA Developer Guide</a></li>
+</ol>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h2 id="Task-6:-Use-direct-load/store-to-remote-memory">Task 6: <a name="task5" />Use direct load/store to remote memory<a class="anchor-link" href="#Task-6:-Use-direct-load/store-to-remote-memory">&#182;</a></h2><p>NVSHMEM allows you to put communications in the GPU kernels. Howerver, the <code>nvhsmem_put / nvshmem_get</code> calls are not easily avilable in OpenACC kernels. However, for <em>intranode</em> communication when all GPUs can use P2P (as in the nodes in Ascent and Summit) you can get a pointer to a remote GPUs memory using <code>nvshmem_ptr</code>.</p>
+<p>To do this you need to:</p>
+<ul>
+<li>use <code>nvshmem_ptr</code> to get pointers to your neighbors' (top/bottom) <code>d_A</code> allocations</li>
+<li>when setting <code>A</code> to <code>Anew</code> also update the halos of your neighbors. You need to use the <code>deviceptr</code> keyword to use the <code>d_Atop / d_Abottom</code> device pointers in an OpenACC directive.</li>
+<li><p>add the needed <code>nvshmem_barrier</code> (see the sketch below).</p>
+</li>
+<li><p>Additional task: similar to the previous task, you can also use asynchronous execution here.</p>
+</li>
+</ul>
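+<p>A minimal, hedged sketch of the direct-store approach. The names <code>d_A</code>, <code>top</code>, <code>bottom</code>, and the remote halo row indices are assumptions for illustration and may differ from the actual TODO code:</p>
+<pre><code>// Hedged sketch, not the reference solution
+double* d_Atop    = (double*) nvshmem_ptr(d_A, top);     // direct P2P pointer to the top neighbor's d_A
+double* d_Abottom = (double*) nvshmem_ptr(d_A, bottom);  // NULL if no P2P path is available
+
+#pragma acc parallel loop present(A, Anew) deviceptr(d_Atop, d_Abottom)
+for (int ix = ix_start; ix &lt; ix_end; ix++) {
+    A[iy_start * nx + ix]     = Anew[iy_start * nx + ix];
+    A[(iy_end - 1) * nx + ix] = Anew[(iy_end - 1) * nx + ix];
+    // write the boundary rows directly into the neighbors' halo rows (assumed offsets)
+    d_Atop[iy_start * nx + ix]        = Anew[iy_start * nx + ix];
+    d_Abottom[(iy_end - 1) * nx + ix] = Anew[(iy_end - 1) * nx + ix];
+}
+nvshmem_barrier_all();  // make sure all remote stores are visible before the next iteration
+</code></pre>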
+<p><em>Look for</em> <strong>TODOs</strong>.</p>
+<p>Compare the scaling and efficiency with the results from the previous tasks and the MPI versions. Check for asynchronous execution in the profiler.</p>
+<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
+<li><a href="./C/task6/poisson2d.c">C Version</a></li>
+</ul>
+<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedirC/task6
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Compilation">Compilation<a class="anchor-link" href="#Compilation">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
+Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
+<span class="o">!</span>make poisson2d
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Running">Running<a class="anchor-link" href="#Running">&#182;</a></h4><p>For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable <code>NP</code>. On <em>Ascent</em> within a single node you can use up to 6 GPUs.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Scaling">Scaling<a class="anchor-link" href="#Scaling">&#182;</a></h4><p>You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run <span class="p">|</span> grep speedup &gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run <span class="p">|</span> grep speedup &gt;&gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="n">data_frame5</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+
+<span class="o">!</span>rm scale.out
+
+<span class="n">data_frame5b</span><span class="o">=</span><span class="n">data_frame5</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frame5b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task6.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
+<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
+<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
+<li><a href="http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf">OpenSHMEM 1.4 Specification</a></li>
+<li><a href="./NVSHMEM-Developer-Guide.pdf">NVSHMEM 0.3 EA Developer Guide</a></li>
+</ol>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<hr>
+<hr>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<hr>
+<h1 id="Survey">Survey<a name="survey" /><a class="anchor-link" href="#Survey">&#182;</a></h1><p>Please remember to take some time and fill out the survey<a href="http://bit.ly/sc19-eval">http://bit.ly/sc19-eval</a>.</p>
+<p><img src="./resources/eval.png" alt="eval.png"></p>
 
 </div>
 </div>
diff --git a/4-GPU/HandsOn/HandsOnGPUProgramming.ipynb b/4-GPU/HandsOn/HandsOnGPUProgramming.ipynb
index b354b24e3dabc50aba9f5b0644b14ffc92dfde58..64f9d3d32678af6971c4a3b0b5295c91f31aa62e 100644
--- a/4-GPU/HandsOn/HandsOnGPUProgramming.ipynb
+++ b/4-GPU/HandsOn/HandsOnGPUProgramming.ipynb
@@ -5,29 +5,32 @@
    "metadata": {},
    "source": [
     "# Hands-On GPU Programming\n",
-    "_Supercomputing 2018 Tutorial \"Application Porting and Optimization on GPU-Accelerated POWER Architectures\", November 12th 2018_\n",
+    "_Supercomputing 2019 Tutorial \"Application Porting and Optimization on GPU-Accelerated POWER Architectures\", November 18th 2019_\n",
     "\n",
     "---"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "### Read me first\n",
     "\n",
     "This tutorial is primarily designed to be executed as a _jupyter_ notebook. However, everything can also be done using an _ssh_ connection to _ascent.olcf.ornl.gov_ in your terminal.\n",
     "\n",
-    "#### Jupyter notebook execution\n",
+    "#### Jupyter Lab execution\n",
     "\n",
     "When using jupyter this notebook will guide you through the step. Note that if you execute a cell multiple times while optimizing the code the output will be replaced. You can however duplicate the cell you want to execute and keep its output. Check the _edit_ menu above.\n",
     "\n",
     "You will always find links to a file browser of the corresponding task subdirectory as well as direct links to the source files you will need to edit as well as the profiling output you need to open locally.\n",
     "\n",
-    "If you want you also can get a [terminal](/terminals/4) in your browser.\n",
+    "If you want you also can get a terminal in your browser by following the *File -> New -> Terminal* in the Jupyter Lab menu bar.\n",
     "\n",
     "#### Terminal fallback\n",
-    "The tasks are placed in directories named `[C/FORTRAN]/task[0-3]`.\n",
+    "The tasks are placed in directories named `[C/FORTRAN]/task[0-6]`.<br>\n",
+    "*Note: The tasks using NVHSMEM (4-6) are only available in C.* \n",
     "\n",
     "The files you will need to edit are always the `poisson2d.(C|F03)` files.\n",
     "\n",
@@ -77,6 +80,7 @@
     "if(not rootdir):\n",
     "    rootdir=%pwd\n",
     "basedir=os.path.join(rootdir,LANGUAGE)\n",
+    "basedirC=os.path.join(rootdir,'C')\n",
     "\n",
     "print (\"You selected {} for the exercises.\".format(LANGUAGE))\n",
     "\n",
@@ -89,36 +93,57 @@
     "    for t in range(4):\n",
     "        d='%s/task%i'%(basedir,t)\n",
     "        %cd $d\n",
-    "        !make clean"
+    "        !make clean\n",
+    "        \n",
+    "#cleanall()"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "# Tasks<a name=\"top\"></a>\n",
     "\n",
     "This session comes with multiple tasks. All tasks are available in C or FORTRAN and can be found in the `[C|Fortan]/task[0-3]` subdirectories. There you will also find Makefiles that are set up so that you can compile and submit all necessary tasks.\n",
     "\n",
-    "Please choose from the task below.\n",
+    "Please choose from the task below. *If you want to go for the advanced NVSHMEM tasks you should complete Task 2 but can skip Task 3 (or postpone it until the end).*\n",
     "\n",
     "\n",
+    "### GPU Programming\n",
+    "\n",
     "* [Task 0](#task0) Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using `–ta=tesla:managed`  \n",
-    "  [Solution 0](#solution0)\n",
     "\n",
     "* [Task 1](#task1) Fix memory access pattern of OpenACC accelerated Jacobi Solver  \n",
-    "  [Solution 1](#solution1)\n",
+    "\n",
+    "\n",
+    "### Multi-GPU with MPI\n",
     "\n",
     "* [Task 2](#task2) Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs  \n",
-    "  [Solution 2](#solution2)\n",
     "\n",
     "* [Task 3](#task3) Hide MPI communication time by overlapping communication and \n",
     "\tcomputation in a MPI+OpenACC multi GPU Jacobi Solver  \n",
-    "  [Solution 3](#solution3)\n",
+    "\n",
     "  \n",
+    "### Multi-GPU with NVSHMEM *(Advanced -- C only)*\n",
     "  \n",
-    "* [Suvery](#survey) Please remember to take the survey !\n",
-    "    \n",
+    "* [Task 4](#task4) Use NVSHMEM instead of MPI  \n",
+    "\n",
+    "* [Task 5](#task5) Put NVSHMEM calls on stream to hide API calls and GPU/CPU synchronization  \n",
+    "\n",
+    "\n",
+    "### Survey\n",
+    " \n",
+    " * [Suvery](#survey) Please remember to take the survey !"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
+   "source": [
     "### Make Targets <a name=\"make\"></a>\n",
     "\n",
     "For all tasks we have defined the following make targets. \n",
@@ -135,7 +160,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "[Back to Top](#top)\n",
     "\n",
@@ -144,7 +171,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "## Task 0: <a name=\"task0\"></a>Using OpenACC\n",
     "\n",
@@ -168,14 +197,9 @@
     "\n",
     "If your are using the jupyter approach by following the link (for the language of your choice), This will open the source code in an editor in a new browser tab/window.\n",
     "\n",
-    "* [C Version](/edit/C/task0/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTAN/task0/poisson2d.F03)\n",
+    "* [C Version](./C/task0/poisson2d.c)\n",
+    "* [Fortran Version](.FORTAN/task0/poisson2d.F03)\n",
     "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task0/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task0/)\n",
     "\n",
     "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
@@ -191,7 +215,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Compilation and Execution\n",
     "\n",
@@ -202,7 +228,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task0')\n",
@@ -212,7 +240,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task0')\n",
@@ -221,11 +251,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Profiling\n",
     "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output file `poisson2d.pgprof`  can be downloaded from here: [C Version](/tree/C/task0/poisson2d.pgprof?download=1) / [Fortran Version](/tree/FORTRAN/task0/poisson2d.pgprof?download=1).\n",
+    "You can profile the code by executing the next cell. __After__ the profiling finished the output file `poisson2d.pgprof` can be downloaded using the file browser.\n",
     "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu.     \n",
     "    "
    ]
@@ -233,7 +265,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task0')\n",
@@ -242,18 +276,15 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### References\n",
     "\n",
     "1. http://www.openacc.org\n",
-    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
+    "\n",
     "[Back to Top](#top)\n",
     "\n",
     "---"
@@ -261,7 +292,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "## Task 1:<a name=\"task1\"></a> Memory Access Patterns\n",
     "\n",
@@ -275,14 +308,9 @@
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task1/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task1/poisson2d.F03)\n",
+    "* [C Version](./C/task1/poisson2d.c)\n",
+    "* [Fortran Version](./FORTRAN/task1/poisson2d.F03)\n",
     "\n",
-    "#### Directory browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task1/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task1/)\n",
     "\n",
     "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
@@ -290,7 +318,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "%cd $basedir/task1"
@@ -298,7 +328,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Compilation and Execution\n",
     "\n",
@@ -309,7 +341,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task1')\n",
@@ -319,7 +353,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task1')\n",
@@ -328,11 +364,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Profiling\n",
     "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output files can be downloaded from here: [C Version](/tree/C/task1/pgprof.poisson2d.Task1.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task1/pgprof.Task1.poisson2d.tar.gz?download=1).\n",
+    "You can profile the code by executing the next cell. Download the tarball containing the profiles (`pgprof.Task1.poisson2d.tar.gz`) with the File Browser.\n",
     "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu.     \n",
     "    "
    ]
@@ -340,16 +378,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!ls"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task1')\n",
@@ -358,7 +389,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "For the _Global Memory Load/Store Efficiency_ the `make profile` command also generated a CSV file that you can import and view with the cell below.  \n",
     "If you purely work in a terminal you can view the same output by running `pgprof -i poisson2d.efficiency.pgprof`."
@@ -367,7 +400,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task1')\n",
@@ -377,20 +412,17 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### References\n",
     "\n",
     "1. http://www.openacc.org\n",
     "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
     "3. [pgprof Quickstart](https://www.pgroup.com/resources/pgprof-quickstart.htm)\n",
-    "4. [CUDA Toolkit Documentation - Profiler](https://docs.nvidia.com/cuda/profiler-users-guide/index.html) _pgprof is based on the NVIDIA Visual Profiler_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "4. [CUDA Toolkit Documentation - Profiler](https://docs.nvidia.com/cuda/profiler-users-guide/index.html) _pgprof is based on the NVIDIA Visual Profiler_\n",
+    "\n",
     "[Back to Top](#top)\n",
     "\n",
     "---"
@@ -398,7 +430,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "## Task 2: <a name=\"task2\"></a>Apply Domain Decomposition\n",
     "\n",
@@ -415,14 +449,9 @@
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task2/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task2/poisson2d.F03)\n",
+    "* [C Version](./C/task2/poisson2d.c)\n",
+    "* [Fortran Version](./FORTRAN/task2/poisson2d.F03)\n",
     "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task2/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task2/)\n",
     "\n",
     "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
@@ -430,7 +459,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "%cd $basedir/task2"
@@ -438,7 +469,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Compilation\n",
     "\n",
@@ -449,7 +482,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task2')\n",
@@ -458,7 +493,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Running\n",
     "\n",
@@ -469,7 +506,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "scrolled": true
+    "exercise": "task"
    },
    "outputs": [],
    "source": [
@@ -479,7 +516,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Scaling\n",
     "\n",
@@ -489,7 +528,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task2')\n",
@@ -507,11 +548,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Profiling\n",
     "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output files can be downloaded from here: [C Version](/tree/C/task2/pgprof.poisson2d.Task2.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task2/pgprof.poisson2d.Task2.tar.gz?download=1).\n",
+    "You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task2.poisson2d.tar.gz`) with the File Browser. \n",
     "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.      \n",
     "    "
    ]
@@ -519,7 +562,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task2')\n",
@@ -528,7 +573,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### References\n",
     "1. http://www.openacc.org\n",
@@ -538,7 +585,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "[Back to Top](#top)\n",
     "\n",
@@ -547,7 +596,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "## Task 3: <a name=\"task3\"></a>Hide MPI Communication time\n",
     "\n",
@@ -564,14 +615,9 @@
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task3/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task3/poisson2d.F03)\n",
+    "* [C Version](./C/task3/poisson2d.c)\n",
+    "* [Fortran Version](./FORTRAN/task3/poisson2d.F03)\n",
     "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task3/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task3/)\n",
     "\n",
     "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
@@ -579,7 +625,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "%cd $basedir/task3"
@@ -587,7 +635,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Compilation\n",
     "\n",
@@ -598,7 +648,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task3')\n",
@@ -607,7 +659,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Running\n",
     "\n",
@@ -617,7 +671,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task3')\n",
@@ -626,7 +682,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Scaling\n",
     "\n",
@@ -636,7 +694,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task3')\n",
@@ -654,19 +714,23 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Profiling\n",
     "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output files can be downloaded from here: [C Version](/tree/C/task3/pgprof.poisson2d.Task3.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task3/pgprof.poisson2d.Task3.tar.gz?download=1).\n",
-    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.   \n",
+    "You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task3.poisson2d.tar.gz`) with the File Browser. \n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.    \n",
     "    "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
     "checkdir('task3')\n",
@@ -675,7 +739,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### References\n",
     "1. http://www.openacc.org\n",
@@ -685,493 +751,449 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "***\n",
-    "***"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "# Solutions\n",
+    "---\n",
+    "# Tasks using NVSHMEM \n",
     "\n",
-    "Below are suggested solutions. This is only a short description of the solution, but the `poisson2d.solution.(c|F03)` files linked below have the full source code. If you want to run / profile the solutions feel free to duplicate the cells for the tasks and change the [make target](#make) to the `*.solution` ones."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "[Back to Top](#top)\n",
+    "**The following tasks are using NVSHMEM instead of MPI. NVSHMEM is currently available as early access software. Please read the following carefully before starting these tasks.**\n",
+    "\n",
+    "* *NVSHMEM early access 0.3.2* is installed on Ascent. It is provided under the license in [LICENSE_NVSHMEM.md](./LICENSE_NVSHMEM.md).\n",
+    "* If you want to continue using the NVHSMEM early access version beyond this tutorial you need to apply for early access at https://developer.nvidia.com/nvshmem\n",
+    "\n",
+    "---\n",
+    "\n",
+    "NVSHMEM enables efficient communication among GPUs.It supports an API for direct communication among GPUs, either initiated by the CPU or by GPUs inside of compute kernels. Inside compute kernels, NVSHMEM also supports direct load/store accesses to remote memory over PCIe or NVLink. The ability to initiate communication from inside kernels eliminates GPU-host-synchronization and associated overheads. It can also benefit from latency tolerance mechanisms available within GPUs. The tasks illustrate that progressing from an MPI-only app to an app that uses NVSHMEM can be straightforward.\n",
+    "\n",
+    "**NOTE**: Covering all feature of NVSHMEM, incuding communication calls in kernels, is not easily accessible through OpenACC and also exceed the scope of this tutorial. However, the OpenACC examples should give you a basic introduction to NVSHMEM.\n",
+    "\n",
+    "You can check the developer guide and the other presentations \n",
+    "\n",
+    "#### References\n",
+    "1. http://www.openacc.org\n",
+    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
+    "3. [OpenSHMEM 1.4 Specification](http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf)\n",
+    "4. [NVSHMEM 0.3 EA Developer Guide](./NVSHMEM-Developer-Guide.pdf)\n",
     "\n",
     "---"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "## Solution 0:<a name=\"solution0\"></a>\n",
-    "\n",
-    "```C++\n",
-    "#pragma acc parallel loop\n",
-    "for (int ix = ix_start; ix < ix_end; ix++)\n",
-    "{\n",
-    "    #pragma acc loop\n",
-    "    for( int iy = iy_start; iy < iy_end; iy++ )\n",
-    "    {\n",
-    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]\n",
-    "                                               + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));\n",
-    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
-    "    }\n",
-    "}\n",
-    "```\n",
+    "## Task 4: <a name=\"task4\"></a>Replace MPI with host side NVSHMEM\n",
+    "\n",
+    "To replace MPI from [Task 2](#task2) with NVSHMEM you will need to connect the NVSHMEM library to MPI and replace all MPI communication calls related to the halo exchange with the corresponding NVSHMEM functions: \n",
+    "\n",
+    "* Include NVSHMEM API header (<code>nvshmem.h</code>)\n",
+    "* Include NVSHMEM API extensions header (<code>nvshmemx.h</code>)\n",
+    "* Initialize NVSHMEM and connect to MPI (<code>nvshmemx_init_attr</code>)\n",
+    "* Allocate symmetric memory (`nvshmem_alloc`) for `A` on the device and use the OpenACC `map` function to tell OpenACC to use it as device memory for `A`\n",
+    "* Replace <code>MPI_Sendrecv</code> calls with SHMEM calls (<code>nvshmem_double_put</code>)\n",
+    "* Insert NVSHMEM barriers to ensure correct execution (<code>nvshmem_barrier_all</code>)\n",
+    "\n",
+    "\n",
+    "\n",
+    "**For interoperability with OpenSHMEM NVSHMEM can also be set up to prefix all calls to NVHSMEM with `nv`. Please make sure to use these version, e.g. use `nvshmem_barrier` instead of `shmem_barrier`. The developer guide mostly uses the unprefixed versions.**\n",
+    "\n",
+    "_Look for_ __TODOs__.\n",
+    "\n",
+    "\n",
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task0/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task0/poisson2d.solution.F03)\n",
+    "* [C Version](./C/task4/poisson2d.c)\n",
     "\n",
-    "#### File browser\n",
     "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task0/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task0/)"
+    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "exercise": "task"
+   },
+   "outputs": [],
    "source": [
-    "#### Compiling, Running and Profiling\n",
-    "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task0/poisson2d.solution.pgprof?download=1) / [Fortran Version](/tree/FORTRAN/task0/poisson2d.solution.pgprof?download=1).    "
+    "%cd $basedirC/task4"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "%cd $basedir/task0"
+    "#### Compilation\n",
+    "\n",
+    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
+    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task0')\n",
-    "!make poisson2d.solution"
+    "checkdir('task4')\n",
+    "!make poisson2d"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "checkdir('task0')\n",
-    "!make run.solution"
+    "#### Running\n",
+    "\n",
+    "For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable `NP`. On _Ascent_ within a single node you can use up to 6 GPUs."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task0')\n",
-    "!make profile.solution"
+    "checkdir('task4')\n",
+    "!NP=2 make run"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "[Back to Top](#top)\n",
+    "#### Scaling\n",
     "\n",
-    "---"
+    "You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell."
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "exercise": "task"
+   },
+   "outputs": [],
    "source": [
-    "## Solution 1:<a name=\"solution1\"></a>\n",
-    "\n",
-    "Swap the `ix` and `iy` loops to make sure that `ix` is the fastest running index \n",
-    "\n",
-    "```C\n",
-    "#pragma acc parallel loop\n",
-    "for (int iy = iy_start; iy < iy_end; iy++)\n",
-    "{\n",
-    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "    {\n",
-    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]\n",
-    "                                               + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));\n",
-    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
-    "    }\n",
-    "}\n",
-    "```\n",
-    "\n",
-    "#### Code\n",
-    "\n",
-    "* [C Version](/edit/C/task1/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task1/poisson2d.solution.F03)\n",
+    "checkdir('task4')\n",
+    "!NP=1 make run | grep speedup > scale.out\n",
+    "!NP=2 make run | grep speedup >> scale.out\n",
+    "!NP=4 make run | grep speedup >>  scale.out\n",
+    "!NP=6 make run | grep speedup >>  scale.out\n",
+    "data_frame4 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
     "\n",
-    "#### File browser\n",
+    "!rm scale.out\n",
     "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task1/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task1/)"
+    "data_frame4b=data_frame4.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frame4b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "#### Compiling, Running and Profiling\n",
+    "#### Profiling\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1).  "
+    "You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task4.poisson2d.tar.gz`) with the File Browser. \n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.  \n",
+    "    "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "%cd $basedir/task1"
+    "checkdir('task4')\n",
+    "!NP=2 make profile"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "checkdir('task1')\n",
-    "!make poisson2d.solution"
+    "#### References\n",
+    "1. http://www.openacc.org\n",
+    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
+    "3. [OpenSHMEM 1.4 Specification](http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf)\n",
+    "4. [NVSHMEM 0.3 EA Developer Guide](./NVSHMEM-Developer-Guide.pdf)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "checkdir('task1')\n",
-    "!make run.solution"
+    "## Task 5: <a name=\"task5\"></a>Make communication asynchronous\n",
+    "\n",
+    "NVSHMEM allows you to put communications in *CUDA streams / OpenACC async queues*. This allows the CPU already set up communication and kernel launches while the GPU is still communicationg, effectively hiding the time spend in API calls.\n",
+    "\n",
+    "To do this you need to:\n",
+    "* use the `async` and `wait` keywords in the OpenACC pragmas to excute the kernels asynchronously in the OpenACC default queu\n",
+    "* replace `nvshmem_double_put` calls with the `nvhsmemx_double_put_on_stream` version.<br>\n",
+    "  use `use acc_get_cuda_stream` and `acc_get_default_async` to get the `cudaStream_t cudaStream` corresponding to the OpenACC default async queue.\n",
+    "* make sure to synchronize before copying the data back to the CPU\n",
+    "\n",
+    "_Look for_ __TODOs__.\n",
+    "\n",
+    "Compare the scaling and efficiency with the results from the previous task and the MPI versions. Check for asynchronous execution in the profiler.\n",
+    "\n",
+    "_Optional_: Try to understand how well communication and compute overlap is able to improve efficiency when scaling to more GPUs.\n",
+    "\n",
+    "#### Code\n",
+    "\n",
+    "* [C Version](./C/task5/poisson2d.c)\n",
+    "\n",
+    "\n",
+    "\n",
+    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task1')\n",
-    "!make profile.solution"
+    "%cd $basedirC/task5"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "For the _Global Memory Load/Store Efficiency_ the `make profile` command also generated a CSV file that you can import and view with the cell below.  \n",
-    "If you purely work in a terminal you can view the same output by running `pgprof -i poisson2d.efficiency.solution.pgprof`."
+    "#### Compilation\n",
+    "\n",
+    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
+    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "data_frame_solution = pandas.read_csv('poisson2d.solution.efficiency.csv', sep=',')\n",
-    "data_frame_solution"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "[Back to Top](#top)\n",
-    "\n",
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Solution 2:<a name=\"solution2\"></a>\n",
-    "\n",
-    "Set the GPU used by the rank using `#pragma acc set device_num`\n",
-    "```C\n",
-    "//Initialize MPI and determine rank and size\n",
-    "MPI_Init(&argc, &argv);\n",
-    "MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n",
-    "MPI_Comm_size(MPI_COMM_WORLD, &size);\n",
-    "\n",
-    "#pragma acc set device_num( rank )\n",
-    "\n",
-    "real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));\n",
-    "real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));\n",
-    "real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));\n",
-    "real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));\n",
-    "```\n",
-    "\n",
-    "\n",
-    "Apply domain decomposition\n",
-    "```C\n",
-    "// Ensure correctness if ny%size != 0\n",
-    "int chunk_size = ceil( (1.0*ny)/size );\n",
-    "\n",
-    "int iy_start = rank * chunk_size;\n",
-    "int iy_end   = iy_start + chunk_size;\n",
-    "\n",
-    "// Do not process boundaries\n",
-    "iy_start = max( iy_start, 1 );\n",
-    "iy_end = min( iy_end, ny - 1 );\n",
-    "```\n",
-    "\n",
-    "Exchange data\n",
-    "```C\n",
-    "//Periodic boundary conditions\n",
-    "int top    = (rank == 0) ? (size-1) : rank-1;\n",
-    "int bottom = (rank == (size-1)) ? 0 : rank+1;\n",
-    "#pragma acc host_data use_device( A )\n",
-    "{\n",
-    "    double start_mpi = MPI_Wtime();\n",
-    "    //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom\n",
-    "    MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "\n",
-    "    //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top\n",
-    "    MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "    mpi_time += MPI_Wtime() - start_mpi;\n",
-    "}\n",
-    "```\n",
-    "\n",
-    "#### Code\n",
-    "\n",
-    "* [C Version](/edit/C/task2/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task2/poisson2d.solution.F03)\n",
-    "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task2/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task2/)"
+    "checkdir('task5')\n",
+    "!make poisson2d"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "#### Compiling, Running and Profiling\n",
+    "#### Running\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task2/pgprof.poisson2d.Task2.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz?download=1).    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%cd $basedir/task2"
+    "For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable `NP`. On _Ascent_ within a single node you can use up to 6 GPUs."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task2')\n",
-    "!make poisson2d.solution"
+    "checkdir('task5')\n",
+    "!NP=2 make run"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "checkdir('task2')\n",
-    "!NP=2 make run.solution"
+    "#### Scaling\n",
+    "\n",
+    "You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task2')\n",
-    "!NP=2 make profile.solution"
+    "checkdir('task5')\n",
+    "!NP=1 make run | grep speedup > scale.out\n",
+    "!NP=2 make run | grep speedup >> scale.out\n",
+    "!NP=4 make run | grep speedup >>  scale.out\n",
+    "!NP=6 make run | grep speedup >>  scale.out\n",
+    "data_frame5 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
+    "\n",
+    "!rm scale.out\n",
+    "\n",
+    "data_frame5b=data_frame5.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frame5b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "#### Scaling\n",
+    "#### Profiling\n",
     "\n",
-    "You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell."
+    "You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task5.poisson2d.tar.gz`) with the File Browser. \n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.   \n",
+    "    "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task2')\n",
-    "!NP=1 make run.solution | grep speedup > scale.out\n",
-    "!NP=2 make run.solution | grep speedup >> scale.out\n",
-    "!NP=4 make run.solution | grep speedup >>  scale.out\n",
-    "!NP=6 make run.solution | grep speedup >>  scale.out\n",
-    "data_frameS2 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
-    "\n",
-    "!rm scale.out\n",
-    "\n",
-    "data_frameS2b=data_frameS2.iloc[:,[5,7,10,12]].copy()\n",
-    "data_frameS2b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})\n"
+    "checkdir('task5')\n",
+    "!NP=2 make profile"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "[Back to Top](#top)\n",
-    "\n",
-    "---"
+    "#### References\n",
+    "1. http://www.openacc.org\n",
+    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
+    "3. [OpenSHMEM 1.4 Specification](http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf)\n",
+    "4. [NVSHMEM 0.3 EA Developer Guide](./NVSHMEM-Developer-Guide.pdf)"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "## Solution 3:<a name=\"solution3\"></a>\n",
-    "\n",
-    "\n",
-    "Update the boundaries first.\n",
-    "```C\n",
-    "#pragma acc parallel loop present(A,Anew)\n",
-    "for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "{\n",
-    "    A[(iy_start)*nx+ix] = Anew[(iy_start)*nx+ix];\n",
-    "    A[(iy_end-1)*nx+ix] = Anew[(iy_end-1)*nx+ix];\n",
-    "}\n",
-    "```\n",
-    "\n",
-    "Start the interior loop asynchronously so it can overlap with the MPI communication and wait at the end for the completion.\n",
-    "```C\n",
-    "#pragma acc parallel loop present(A,Anew) async\n",
-    "for (int iy = iy_start+1; iy < iy_end-1; iy++)\n",
-    "{\n",
-    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "    {\n",
-    "        A[iy*nx+ix] = Anew[iy*nx+ix];\n",
-    "    }\n",
-    "}\n",
-    "\n",
-    "//Periodic boundary conditions\n",
-    "int top    = (rank == 0) ? (size-1) : rank-1;\n",
-    "int bottom = (rank == (size-1)) ? 0 : rank+1;\n",
-    "#pragma acc host_data use_device( A )\n",
-    "{\n",
-    "    double start_mpi = MPI_Wtime();\n",
-    "    //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom\n",
-    "    MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "\n",
-    "    //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top\n",
-    "    MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "    mpi_time += MPI_Wtime() - start_mpi;\n",
-    "}\n",
-    "#pragma acc wait\n",
-    "```\n",
+    "## Task 6: <a name=\"task5\"></a>Use direct load/store to remote memory\n",
+    "\n",
+    "NVSHMEM allows you to put communications in the GPU kernels. Howerver, the `nvhsmem_put / nvshmem_get` calls are not easily avilable in OpenACC kernels. However, for *intranode* communication when all GPUs can use P2P (as in the nodes in Ascent and Summit) you can get a pointer to a remote GPUs memory using `nvshmem_ptr`.\n",
+    "\n",
+    "To do this you need to:\n",
+    "* use the `nvshmem_ptr` to get pointers to your neighboring (top/bottom) `d_A` allocation\n",
+    "* when setting `A` to `Anew` also update the halos of your neighbors. You need to use the `deviceptr` keyword to use `d_Atop / d_Abottom` device pointers in an OpenACC directly.\n",
+    "* add the needed `nvhsmem_barrier`.\n",
+    "\n",
+    "* Additional task: Similar to the previous version you can use asynchronous execution here.\n",
+    "\n",
+    "_Look for_ __TODOs__.\n",
     "\n",
+    "Compare the scaling and efficiency with the results from the previous tasks and the MPI versions. Check for asynchronous execution in the profiler.\n",
     "\n",
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task3/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task3/poisson2d.solution.F03)\n",
+    "* [C Version](./C/task6/poisson2d.c)\n",
     "\n",
-    "#### File browser\n",
     "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task3/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task3/)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Compiling, Running and Profiling\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task3/pgprof.poisson2d.Task3.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz?download=1).    "
+    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "%cd $basedir/task3"
+    "%cd $basedirC/task6"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
-    "checkdir('task3')\n",
-    "!make poisson2d.solution"
+    "#### Compilation\n",
+    "\n",
+    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
+    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task3')\n",
-    "!NP=2 make run.solution"
+    "checkdir('task6')\n",
+    "!make poisson2d"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
+   "source": [
+    "#### Running\n",
+    "\n",
+    "For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable `NP`. On _Ascent_ within a single node you can use up to 6 GPUs."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task3')\n",
-    "!NP=2 make profile.solution"
+    "checkdir('task6')\n",
+    "!NP=2 make run"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "source": [
     "#### Scaling\n",
     "\n",
@@ -1181,20 +1203,60 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "exercise": "task"
+   },
    "outputs": [],
    "source": [
-    "checkdir('task3')\n",
-    "!NP=1 make run.solution | grep speedup > scale.out\n",
-    "!NP=2 make run.solution | grep speedup >> scale.out\n",
-    "!NP=4 make run.solution | grep speedup >>  scale.out\n",
-    "!NP=6 make run.solution | grep speedup >>  scale.out\n",
-    "data_frameS3 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
+    "checkdir('task6')\n",
+    "!NP=1 make run | grep speedup > scale.out\n",
+    "!NP=2 make run | grep speedup >> scale.out\n",
+    "!NP=4 make run | grep speedup >>  scale.out\n",
+    "!NP=6 make run | grep speedup >>  scale.out\n",
+    "data_frame5 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
     "\n",
     "!rm scale.out\n",
     "\n",
-    "data_frameS3b=data_frameS3.iloc[:,[5,7,10,12]].copy()\n",
-    "data_frameS3b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
+    "data_frame5b=data_frame5.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frame5b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
+   "source": [
+    "#### Profiling\n",
+    "\n",
+    "You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task6.poisson2d.tar.gz`) with the File Browser. \n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.    \n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "exercise": "task"
+   },
+   "outputs": [],
+   "source": [
+    "checkdir('task6')\n",
+    "!NP=2 make profile"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "task"
+   },
+   "source": [
+    "#### References\n",
+    "1. http://www.openacc.org\n",
+    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
+    "3. [OpenSHMEM 1.4 Specification](http://www.openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf)\n",
+    "4. [NVSHMEM 0.3 EA Developer Guide](./NVSHMEM-Developer-Guide.pdf)"
    ]
   },
   {
@@ -1209,13 +1271,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "---\n",
+    "\n",
     "# Survey<a name=\"survey\"></a>\n",
     "\n",
-    "Please rememeber to take some time and fill out the [survey](http://bit.ly/sc18-eval)."
+    "Please remember to take some time and fill out the surveyhttp://bit.ly/sc19-eval.\n",
+    "\n",
+    "![eval.png](./resources/eval.png)"
    ]
   }
  ],
  "metadata": {
+  "celltoolbar": "Edit Metadata",
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
@@ -1231,7 +1298,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.0"
+   "version": "3.7.5"
   },
   "toc": {
    "base_numbering": 1,
@@ -1248,5 +1315,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/4-GPU/HandsOn/HandsOnGPUProgramming_Task.pdf b/4-GPU/HandsOn/HandsOnGPUProgramming_Task.pdf
deleted file mode 100644
index fa5b7f8bbc562c0da982f8d9904b9276fc0d44d0..0000000000000000000000000000000000000000
Binary files a/4-GPU/HandsOn/HandsOnGPUProgramming_Task.pdf and /dev/null differ
diff --git a/4-GPU/HandsOn/Solution/C/.clang-format b/4-GPU/HandsOn/Solution/C/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..c38075d9b581125672afffab80d5b7bed31baa7a
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/.clang-format
@@ -0,0 +1,148 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:   
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: true
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:   
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Preserve
+IncludeCategories: 
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats: 
+  - Language:        Cpp
+    Delimiters:      
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:      
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions: 
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+ReflowComments:  true
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Auto
+TabWidth:        8
+UseTab:          Never
+...
+
diff --git a/4-GPU/HandsOn/Solution/C/task0/Makefile b/4-GPU/HandsOn/Solution/C/task0/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e3265d91d53c6ff782546d7ba7011e06ba661617
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task0/Makefile
@@ -0,0 +1,49 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
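+# Select the build flavor with e.g. "make COMPILER=PGI-multicore" (defaults to PGI-tesla)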
+COMPILER ?= PGI-tesla
+CC = pgcc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	CFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=0
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.pgprof poisson2d.pgprof
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF}  -f -o ${SC19_DIR_SCRATCH}/poisson2d.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.pgprof .
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof .
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/C/task0/common.h b/4-GPU/HandsOn/Solution/C/task0/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d1efa0c24854c242049dba3633d0e4001a09fd
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task0/common.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
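+/* Maps an MPI size to a (y, x) process-grid factorization with y * x == size; see size_to_2Dsize() below. */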
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+double get_time();
+
+void poisson2d_serial( int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task0/poisson2d.c b/4-GPU/HandsOn/Solution/C/task0/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..e0eed22fa624796c68fc23f3285db5b14ab0ab86
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task0/poisson2d.c
@@ -0,0 +1,137 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 2048;
+    int nx = 2048;
+    int iter_max = 500;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    printf("Calculate reference solution and time serial CPU execution.\n");
+    double start = get_time();
+    poisson2d_serial(iter_max, tol, Aref, Anew, nx, ny, rhs);
+    double runtime_cpu = get_time() - start;
+
+    printf("GPU execution.\n");
+    start = get_time();
+    int iter = 0;
+    real error = 1.0;
+
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+        // TODO: Parallelize loop nest with OpenACC
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        // TODO: Parallelize loop nest with OpenACC
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        // TODO: Parallelize loop nest with OpenACC
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
+        }
+        // TODO: Parallelize loop nest with OpenACC
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+    double runtime = get_time() - start;
+
+    int errors = 0;
+    if (check_results(ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        printf("%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny, nx, runtime_cpu,
+               runtime, runtime_cpu / runtime);
+    } else {
+        errors = -1;
+    }
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.c
index 15a4091ae1a52eec8494977658eb49238f40e444..5109184c399166a0dd389f1f9c78be7495712bb5 100644
--- a/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.c
+++ b/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.c
@@ -26,122 +26,108 @@
  */
 
 #include <math.h>
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "common.h"
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
     int ny = 2048;
     int nx = 2048;
     int iter_max = 500;
     const real tol = 1.0e-5;
 
-    if (argc == 2)
-    {
-        iter_max = atoi( argv[1] );
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
     }
 
-    real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));
-    
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
     // set rhs
-    for (int iy = 1; iy < ny-1; iy++)
-    {
-        for( int ix = 1; ix < nx-1; ix++ )
-        {
-            const real x = -1.0 + (2.0*ix/(nx-1));
-            const real y = -1.0 + (2.0*iy/(ny-1));
-            rhs[iy*nx+ix] = expr(-10.0*(x*x + y*y));
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
         }
     }
 
     int ix_start = 1;
-    int ix_end   = (nx - 1);
+    int ix_end = (nx - 1);
 
     int iy_start = 1;
-    int iy_end   = (ny - 1);
-
-    //OpenACC Warm-up
-    #pragma acc parallel loop
-    for( int iy = 0; iy < ny; iy++)
-    {
-        for( int ix = 0; ix < nx; ix++ )
-        {
-            A[iy*nx+ix] = 0.0;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            A[iy * nx + ix] = 0.0;
         }
     }
-    
+
     printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
-    
+
     printf("Calculate reference solution and time serial CPU execution.\n");
     double start = get_time();
-    poisson2d_serial( iter_max, tol, Aref, Anew, nx, ny, rhs );
+    poisson2d_serial(iter_max, tol, Aref, Anew, nx, ny, rhs);
     double runtime_cpu = get_time() - start;
 
     printf("GPU execution.\n");
     start = get_time();
-    int iter  = 0;
+    int iter = 0;
     real error = 1.0;
-    
-    while ( error > tol && iter < iter_max )
-    {
+
+    while (error > tol && iter < iter_max) {
         error = 0.0;
 
-        #pragma acc parallel loop
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]
-                                                       + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));
-                error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
             }
         }
-        
-        #pragma acc parallel loop
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                A[iy*nx+ix] = Anew[iy*nx+ix];
+
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
             }
         }
 
-        //Periodic boundary conditions
-        #pragma acc parallel loop
-        for (int ix = ix_start; ix < ix_end; ix++)
-        {
-                A[0*nx+ix]      = A[(ny-2)*nx+ix];
-                A[(ny-1)*nx+ix] = A[1*nx+ix];
+// Periodic boundary conditions
+#pragma acc parallel loop
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
         }
-        #pragma acc parallel loop
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-                A[iy*nx+0]      = A[iy*nx+(nx-2)];
-                A[iy*nx+(nx-1)] = A[iy*nx+1];
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
         }
-        
-        if((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
-        
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
         iter++;
     }
     double runtime = get_time() - start;
 
     int errors = 0;
-    if (check_results( ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx ))
-    {
-        printf( "%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny,nx, runtime_cpu, runtime, runtime_cpu/runtime );
-    }
-    else
-    {
+    if (check_results(ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        printf("%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny, nx, runtime_cpu,
+               runtime, runtime_cpu / runtime);
+    } else {
         errors = -1;
     }
-    
+
     free(rhs);
     free(Anew);
     free(Aref);
diff --git a/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.pgprof b/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.pgprof
index 1b65eef99fac652b77381f47c93f1b862b2516c6..ab8fa4258e5b25c4c7664f1af6b62600d9409540 100644
Binary files a/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.pgprof and b/4-GPU/HandsOn/Solution/C/task0/poisson2d.solution.pgprof differ
diff --git a/4-GPU/HandsOn/Solution/C/task0/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task0/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..397097da0150e29eb9efa5b598a1fea57009435b
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task0/poisson2d_serial.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "common.h"
+
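+/* Return wall-clock time in seconds (via gettimeofday). */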
+double get_time() {
+    struct timeval tv;
+    struct timezone tz;
+    gettimeofday(&tv, &tz);
+    return 1.0 * tv.tv_sec + 1.0E-6 * tv.tv_usec;
+}
+
+void poisson2d_serial(int iter_max, real tol, real* restrict const Aref, real* restrict const Anew,
+                      int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 *
+                    (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                          Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+            }
+        }
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        for (int ix = 1; ix < nx - 1; ix++) {
+            Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+            Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+        }
+        for (int iy = 1; iy < ny - 1; iy++) {
+            Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+            Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+}
+
+int check_results(int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "ERROR: A[%d][%d] = %f does not match %f (reference)\n", iy, ix,
+                        A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task1/Makefile b/4-GPU/HandsOn/Solution/C/task1/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ba7aed9c3758168db58f7fe9213c5fa847acc4c2
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task1/Makefile
@@ -0,0 +1,58 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
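+# Select the build flavor with e.g. "make COMPILER=PGI-multicore" (defaults to PGI-tesla)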
+COMPILER ?= PGI-tesla
+CC = pgcc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	CFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=1
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.*.pgprof poisson2d.*.pgprof *.tar.gz *.csv
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.timeline.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF} --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.metrics.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof ./poisson2d 3
+	pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.efficiency.csv
+	mv ${SC19_DIR_SCRATCH}/poisson2d.*.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.timeline.pgprof poisson2d.metrics.pgprof
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+		${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.timeline.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.solution.metrics.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3
+		pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.solution.efficiency.csv
+		mv ${SC19_DIR_SCRATCH}/poisson2d.solution.*.pgprof .
+		tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz  poisson2d.solution.*.pgprof
+
diff --git a/4-GPU/HandsOn/Solution/C/task1/common.h b/4-GPU/HandsOn/Solution/C/task1/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d1efa0c24854c242049dba3633d0e4001a09fd
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task1/common.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
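+/* Maps an MPI size to a (y, x) process-grid factorization with y * x == size; see size_to_2Dsize() below. */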
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+double get_time();
+
+void poisson2d_serial( int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task1/pgprof.poisson2d.Task1.solution.tar.gz b/4-GPU/HandsOn/Solution/C/task1/pgprof.poisson2d.Task1.solution.tar.gz
index 38e40dae716b4f1bdbe3645a18ae7fc252c92f65..32c823991a56f5f061282dbccb09f2e2d850a89d 100644
Binary files a/4-GPU/HandsOn/Solution/C/task1/pgprof.poisson2d.Task1.solution.tar.gz and b/4-GPU/HandsOn/Solution/C/task1/pgprof.poisson2d.Task1.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/C/task1/poisson2d.c b/4-GPU/HandsOn/Solution/C/task1/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..055b318854c0b29a2bb89c00f19e795dda30dae1
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task1/poisson2d.c
@@ -0,0 +1,139 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 2048;
+    int nx = 2048;
+    int iter_max = 500;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    printf("Calculate reference solution and time serial CPU execution.\n");
+    double start = get_time();
+    poisson2d_serial(iter_max, tol, Aref, Anew, nx, ny, rhs);
+    double runtime_cpu = get_time() - start;
+
+    printf("GPU execution.\n");
+    start = get_time();
+    int iter = 0;
+    real error = 1.0;
+
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop
+        for (int ix = ix_start; ix < ix_end; ix++) {
+#pragma acc loop
+            for (int iy = iy_start; iy < iy_end; iy++) {
+                // TODO: Fix memory access pattern
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
+        }
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+    double runtime = get_time() - start;
+
+    int errors = 0;
+    if (check_results(ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        printf("%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny, nx, runtime_cpu,
+               runtime, runtime_cpu / runtime);
+    } else {
+        errors = -1;
+    }
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.c
index 2542f676a20f94f930285b5e846c35046c4ccc72..5109184c399166a0dd389f1f9c78be7495712bb5 100644
--- a/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.c
+++ b/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -26,122 +26,108 @@
  */
 
 #include <math.h>
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "common.h"
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
     int ny = 2048;
     int nx = 2048;
     int iter_max = 500;
     const real tol = 1.0e-5;
 
-    if (argc == 2)
-    {
-        iter_max = atoi( argv[1] );
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
     }
 
-    real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));
-    
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
     // set rhs
-    for (int iy = 1; iy < ny-1; iy++)
-    {
-        for( int ix = 1; ix < nx-1; ix++ )
-        {
-            const real x = -1.0 + (2.0*ix/(nx-1));
-            const real y = -1.0 + (2.0*iy/(ny-1));
-            rhs[iy*nx+ix] = expr(-10.0*(x*x + y*y));
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
         }
     }
 
     int ix_start = 1;
-    int ix_end   = (nx - 1);
+    int ix_end = (nx - 1);
 
     int iy_start = 1;
-    int iy_end   = (ny - 1);
-
-    //OpenACC Warm-up
-    #pragma acc parallel loop
-    for( int iy = 0; iy < ny; iy++)
-    {
-        for( int ix = 0; ix < nx; ix++ )
-        {
-            A[iy*nx+ix] = 0.0;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            A[iy * nx + ix] = 0.0;
         }
     }
-    
+
     printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
-    
+
     printf("Calculate reference solution and time serial CPU execution.\n");
     double start = get_time();
-    poisson2d_serial( iter_max, tol, Aref, Anew, nx, ny, rhs );
+    poisson2d_serial(iter_max, tol, Aref, Anew, nx, ny, rhs);
     double runtime_cpu = get_time() - start;
 
     printf("GPU execution.\n");
     start = get_time();
-    int iter  = 0;
+    int iter = 0;
     real error = 1.0;
-    
-    while ( error > tol && iter < iter_max )
-    {
+
+    while (error > tol && iter < iter_max) {
         error = 0.0;
 
-        #pragma acc parallel loop
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]
-                                                       + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));
-                error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
             }
         }
-        
-        #pragma acc parallel loop
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                A[iy*nx+ix] = Anew[iy*nx+ix];
+
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
             }
         }
 
-        //Periodic boundary conditions
-        #pragma acc parallel loop
-        for (int ix = ix_start; ix < ix_end; ix++)
-        {
-                A[0*nx+ix]      = A[(ny-2)*nx+ix];
-                A[(ny-1)*nx+ix] = A[1*nx+ix];
+// Periodic boundary conditions
+#pragma acc parallel loop
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
         }
-        #pragma acc parallel loop
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-                A[iy*nx+0]      = A[iy*nx+(nx-2)];
-                A[iy*nx+(nx-1)] = A[iy*nx+1];
+#pragma acc parallel loop
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
         }
-        
-        if((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
-        
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
         iter++;
     }
     double runtime = get_time() - start;
 
     int errors = 0;
-    if (check_results( ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx ))
-    {
-        printf( "%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny,nx, runtime_cpu, runtime, runtime_cpu/runtime );
-    }
-    else
-    {
+    if (check_results(ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        printf("%dx%d: 1 CPU: %8.4f s, 1 GPU: %8.4f s, speedup: %8.2f\n", ny, nx, runtime_cpu,
+               runtime, runtime_cpu / runtime);
+    } else {
         errors = -1;
     }
-    
+
     free(rhs);
     free(Anew);
     free(Aref);
diff --git a/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.efficiency.csv b/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.efficiency.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e9338b3189c854bc4e8eb2c84343cd261fb193bd
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task1/poisson2d.solution.efficiency.csv
@@ -0,0 +1,13 @@
+"Device","Kernel","Invocations","Metric Name","Metric Description","Min","Max","Avg"
+"Tesla V100-SXM2-16GB (0)","main_98_gpu",3,"gld_efficiency","Global Memory Load Efficiency",90.885714%,91.040204%,90.938132%
+"Tesla V100-SXM2-16GB (0)","main_98_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.956522%,88.956522%,88.956522%
+"Tesla V100-SXM2-16GB (0)","main_106_gpu",3,"gld_efficiency","Global Memory Load Efficiency",94.722222%,94.722222%,94.722222%
+"Tesla V100-SXM2-16GB (0)","main_106_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.956522%,88.956522%,88.956522%
+"Tesla V100-SXM2-16GB (0)","main_94_gpu__red",3,"gld_efficiency","Global Memory Load Efficiency",99.756335%,99.756335%,99.756335%
+"Tesla V100-SXM2-16GB (0)","main_94_gpu__red",3,"gst_efficiency","Global Memory Store Efficiency",25.000000%,25.000000%,25.000000%
+"Tesla V100-SXM2-16GB (0)","main_66_gpu",1,"gld_efficiency","Global Memory Load Efficiency",0.000000%,0.000000%,0.000000%
+"Tesla V100-SXM2-16GB (0)","main_66_gpu",1,"gst_efficiency","Global Memory Store Efficiency",100.000000%,100.000000%,100.000000%
+"Tesla V100-SXM2-16GB (0)","main_88_gpu",3,"gld_efficiency","Global Memory Load Efficiency",91.836772%,91.874827%,91.856345%
+"Tesla V100-SXM2-16GB (0)","main_88_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.845486%,88.845486%,88.845486%
+"Tesla V100-SXM2-16GB (0)","main_111_gpu",3,"gld_efficiency","Global Memory Load Efficiency",25.000000%,25.000000%,25.000000%
+"Tesla V100-SXM2-16GB (0)","main_111_gpu",3,"gst_efficiency","Global Memory Store Efficiency",25.000000%,25.000000%,25.000000%
diff --git a/4-GPU/HandsOn/Solution/C/task1/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task1/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..397097da0150e29eb9efa5b598a1fea57009435b
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task1/poisson2d_serial.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <sys/time.h>
+
+#include "common.h"
+
+double get_time() {
+    struct timeval tv;
+    struct timezone tz;
+    gettimeofday(&tv, &tz);
+    return 1.0 * tv.tv_sec + 1.0E-6 * tv.tv_usec;
+}
+
+void poisson2d_serial(int iter_max, real tol, real* restrict const Aref, real* restrict const Anew,
+                      int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 *
+                    (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                          Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+            }
+        }
+
+        for (int iy = 1; iy < ny - 1; iy++) {
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        for (int ix = 1; ix < nx - 1; ix++) {
+            Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+            Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+        }
+        for (int iy = 1; iy < ny - 1; iy++) {
+            Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+            Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+        }
+
+        if ((iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+}
+
+int check_results(int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "ERROR: A[%d][%d] = %f does not match %f (reference)\n", iy, ix,
+                        A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task2/Makefile b/4-GPU/HandsOn/Solution/C/task2/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..22eb6567c31e7f2863a4852c84a93522e1c4ac5a
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task2/Makefile
@@ -0,0 +1,50 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=2
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/C/task2/common.h b/4-GPU/HandsOn/Solution/C/task2/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task2/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+static inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+static inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task2/pgprof.poisson2d.Task2.solution.tar.gz b/4-GPU/HandsOn/Solution/C/task2/pgprof.poisson2d.Task2.solution.tar.gz
index 705672f81b7daa5329cd77049457cbd0cddcb07e..d6af762383dfe9861007e784636dc082325b4b07 100644
Binary files a/4-GPU/HandsOn/Solution/C/task2/pgprof.poisson2d.Task2.solution.tar.gz and b/4-GPU/HandsOn/Solution/C/task2/pgprof.poisson2d.Task2.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/C/task2/poisson2d.c b/4-GPU/HandsOn/Solution/C/task2/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..c3a06bdbad6c34c3db78bf1ffe64d2ded4884fe2
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task2/poisson2d.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // TODO: handle device affinity
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // TODO: set first and last row to be processed by this rank.
+    int iy_start = 1;
+    int iy_end = (ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+// Periodic boundary conditions
+// TODO: Handle top/bottom periodic boundary conditions and halo exchange with MPI
+#pragma acc parallel loop present(A)
+        for (int ix = 1; ix < nx - 1; ix++) {
+            A[0 * nx + ix] = A[(ny - 2) * nx + ix];
+            A[(ny - 1) * nx + ix] = A[1 * nx + ix];
+        }
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        // TODO: Pass device ptr of A to MPI using host_data use_device
+        {
+            double start_mpi = MPI_Wtime();
+            // TODO: 1. Send row iy_start (first modified row) to top; receive lower boundary
+            // (iy_end) from bottom
+
+            // MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_REAL_TYPE,
+            // int dest, 0, void *recvbuf, int recvcount, MPI_REAL_TYPE, int source, 0,
+            // MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+
+            // TODO: 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+
+            // MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_REAL_TYPE,
+            // int dest, 0, void *recvbuf, int recvcount, MPI_REAL_TYPE, int source, 0,
+            // MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task2/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task2/poisson2d.solution.c
index 8e3aae7d8eabc8e1408762f9f3cd29d0f094f88b..4d9b354d941fc752161b375767d6b6eb4fea524c 100644
--- a/4-GPU/HandsOn/Solution/C/task2/poisson2d.solution.c
+++ b/4-GPU/HandsOn/Solution/C/task2/poisson2d.solution.c
@@ -26,201 +26,193 @@
  */
 
 #include <math.h>
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include <mpi.h>
 
 #include "common.h"
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
     int ny = 4096;
     int nx = 4096;
     int iter_max = 1000;
     const real tol = 1.0e-5;
 
-    if (argc == 2)
-    {
-        iter_max = atoi( argv[1] );
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
     }
-    
+
     int rank = 0;
     int size = 1;
 
-    //Initialize MPI and determine rank and size
+    // Initialize MPI and determine rank and size
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
-        
-    #pragma acc set device_num( rank )
-
-    real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));
-    
+
+#pragma acc set device_num(rank)
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
     // set rhs
-    for (int iy = 1; iy < ny-1; iy++)
-    {
-        for( int ix = 1; ix < nx-1; ix++ )
-        {
-            const real x = -1.0 + (2.0*ix/(nx-1));
-            const real y = -1.0 + (2.0*iy/(ny-1));
-            rhs[iy*nx+ix] = expr(-10.0*(x*x + y*y));
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
         }
     }
 
-    #pragma acc enter data create(A[0:nx*ny],Aref[0:nx*ny],Anew[0:nx*ny],rhs[0:nx*ny])
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
 
     int ix_start = 1;
-    int ix_end   = (nx - 1);
+    int ix_end = (nx - 1);
 
     // Ensure correctness if ny%size != 0
-    int chunk_size = ceil( (1.0*ny)/size );
+    int chunk_size = ceil((1.0 * ny) / size);
 
     int iy_start = rank * chunk_size;
-    int iy_end   = iy_start + chunk_size;
+    int iy_end = iy_start + chunk_size;
 
     // Do not process boundaries
-    iy_start = max( iy_start, 1 );
-    iy_end = min( iy_end, ny - 1 );
-    
-    //OpenACC Warm-up
-    #pragma acc parallel loop present(A,Aref)
-    for( int iy = 0; iy < ny; iy++)
-    {
-        for( int ix = 0; ix < nx; ix++ )
-        {
-            Aref[iy*nx+ix] = 0.0;
-            A[iy*nx+ix] = 0.0;
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
         }
     }
-    
-    //MPI Warm-up to establish CUDA IPC connections
-    for (int i=0; i<2; ++i)
-    {
-        int top    = (rank == 0) ? (size-1) : rank-1;
-        int bottom = (rank == (size-1)) ? 0 : rank+1;
-        #pragma acc host_data use_device( A )
+
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
         {
-            //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom
-            MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
-
-            //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top
-            MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         }
     }
-        
-    //Wait for all processes to finish Warm-up
-    MPI_Barrier( MPI_COMM_WORLD );
-    
-    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
 
     double runtime_serial = 0.0;
-    if ( rank == 0)
-    {
+    if (rank == 0) {
         printf("Calculate reference solution and time serial execution.\n");
-        //Timing of MPI rank 0 is used to calculate speedup do this in isolation
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
         double start = MPI_Wtime();
-        poisson2d_serial( rank, iter_max, tol, Aref, Anew, nx, ny, rhs );
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
         runtime_serial = MPI_Wtime() - start;
     }
-    MPI_Bcast(Aref, nx*ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
 
-    //Wait for all processes to ensure correct timing of the parallel version
-    MPI_Barrier( MPI_COMM_WORLD );
-    if ( rank == 0) printf("Parallel execution.\n");
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
     double mpi_time = 0.0;
     double start = MPI_Wtime();
-    int iter  = 0;
+    int iter = 0;
     real error = 1.0;
-    
-    #pragma acc update device(A[(iy_start-1)*nx:((iy_end-iy_start)+2)*nx],rhs[iy_start*nx:(iy_end-iy_start)*nx])
-    while ( error > tol && iter < iter_max )
-    {
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
         error = 0.0;
 
-        #pragma acc parallel loop present(A,Anew,rhs)
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]
-                                                       + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));
-                error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
             }
         }
-        
+
         real globalerror = 0.0;
-        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
         error = globalerror;
-        
-        #pragma acc parallel loop present(A,Anew)
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                A[iy*nx+ix] = Anew[iy*nx+ix];
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
             }
         }
 
-        //Periodic boundary conditions
-        int top    = (rank == 0) ? (size-1) : rank-1;
-        int bottom = (rank == (size-1)) ? 0 : rank+1;
-        #pragma acc host_data use_device( A )
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
         {
             double start_mpi = MPI_Wtime();
-            //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom
-            MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
-
-            //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top
-            MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             mpi_time += MPI_Wtime() - start_mpi;
         }
 
-        #pragma acc parallel loop present(A)
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-                A[iy*nx+0]      = A[iy*nx+(nx-2)];
-                A[iy*nx+(nx-1)] = A[iy*nx+1];
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
         }
-        
-        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
-        
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
         iter++;
     }
-    #pragma acc update self(A[(iy_start-1)*nx:((iy_end-iy_start)+2)*nx])
-    MPI_Barrier( MPI_COMM_WORLD );
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
     double runtime = MPI_Wtime() - start;
 
     int errors = 0;
-    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx ))
-    {
-        if ( rank == 0 )
-        {
-            printf( "Num GPUs: %d.\n", size );
-            printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", ny,nx, runtime_serial, size, runtime, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
-            printf( "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time, (iter*4*(ix_end-ix_start)*sizeof(real))/(1024*1024*1024*mpi_time) );
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
         }
-    }
-    else
-    {
+    } else {
         errors = -1;
     }
 
-    #pragma acc exit data delete(A,Aref,Anew,rhs)
+#pragma acc exit data delete (A, Aref, Anew, rhs)
     MPI_Finalize();
-    
+
     free(rhs);
     free(Anew);
     free(Aref);
diff --git a/4-GPU/HandsOn/Solution/C/task2/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task2/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0229e7530bd3fc021baafe0a744506640b34e67
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task2/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task3/Makefile b/4-GPU/HandsOn/Solution/C/task3/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..c0e62d7691a752cb192f680238e105ad17c39c19
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task3/Makefile
@@ -0,0 +1,50 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicc
+CFLAGS = -DUSE_DOUBLE
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=3
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.c poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/C/task3/common.h b/4-GPU/HandsOn/Solution/C/task3/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task3/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+static inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+static inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task3/pgprof.poisson2d.Task3.solution.tar.gz b/4-GPU/HandsOn/Solution/C/task3/pgprof.poisson2d.Task3.solution.tar.gz
index 77e53b2f17e58da807b1b1a17fcaba99e17cd6ca..4952fa5526d7f2dfb560e547fb495d4e5d38a289 100644
Binary files a/4-GPU/HandsOn/Solution/C/task3/pgprof.poisson2d.Task3.solution.tar.gz and b/4-GPU/HandsOn/Solution/C/task3/pgprof.poisson2d.Task3.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/C/task3/poisson2d.c b/4-GPU/HandsOn/Solution/C/task3/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d62b6a3ca6c5124828687b9d6bf8718bf2c1550
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task3/poisson2d.c
@@ -0,0 +1,224 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mpi.h>
+
+#include "common.h"
+
+int main(int argc, char** argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+#pragma acc set device_num(rank)
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+// TODO: Split into halo and bulk part
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+        // TODO: Start bulk part asynchronously
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+        // TODO: wait for bulk part
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task3/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task3/poisson2d.solution.c
index 717ce2129a5cb195699479562020c1a8a4e6e511..b601cb26289bd62a4c5cc504c681704c9f62de08 100644
--- a/4-GPU/HandsOn/Solution/C/task3/poisson2d.solution.c
+++ b/4-GPU/HandsOn/Solution/C/task3/poisson2d.solution.c
@@ -26,209 +26,200 @@
  */
 
 #include <math.h>
-#include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include <mpi.h>
 
 #include "common.h"
 
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv) {
     int ny = 4096;
     int nx = 4096;
     int iter_max = 1000;
     const real tol = 1.0e-5;
 
-    if (argc == 2)
-    {
-        iter_max = atoi( argv[1] );
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
     }
-    
+
     int rank = 0;
     int size = 1;
 
-    //Initialize MPI and determine rank and size
+    // Initialize MPI and determine rank and size
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
-        
-    #pragma acc set device_num( rank )
-
-    real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));
-    real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));
-    
+
+#pragma acc set device_num(rank)
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
     // set rhs
-    for (int iy = 1; iy < ny-1; iy++)
-    {
-        for( int ix = 1; ix < nx-1; ix++ )
-        {
-            const real x = -1.0 + (2.0*ix/(nx-1));
-            const real y = -1.0 + (2.0*iy/(ny-1));
-            rhs[iy*nx+ix] = expr(-10.0*(x*x + y*y));
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
         }
     }
 
-    #pragma acc enter data create(A[0:nx*ny],Aref[0:nx*ny],Anew[0:nx*ny],rhs[0:nx*ny])
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
 
     int ix_start = 1;
-    int ix_end   = (nx - 1);
+    int ix_end = (nx - 1);
 
     // Ensure correctness if ny%size != 0
-    int chunk_size = ceil( (1.0*ny)/size );
+    int chunk_size = ceil((1.0 * ny) / size);
 
     int iy_start = rank * chunk_size;
-    int iy_end   = iy_start + chunk_size;
+    int iy_end = iy_start + chunk_size;
 
     // Do not process boundaries
-    iy_start = max( iy_start, 1 );
-    iy_end = min( iy_end, ny - 1 );
-    
-    //OpenACC Warm-up
-    #pragma acc parallel loop present(A,Aref)
-    for( int iy = 0; iy < ny; iy++)
-    {
-        for( int ix = 0; ix < nx; ix++ )
-        {
-            Aref[iy*nx+ix] = 0.0;
-            A[iy*nx+ix] = 0.0;
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
         }
     }
-    
-    //MPI Warm-up to establish CUDA IPC connections
-    for (int i=0; i<2; ++i)
-    {
-        int top    = (rank == 0) ? (size-1) : rank-1;
-        int bottom = (rank == (size-1)) ? 0 : rank+1;
-        #pragma acc host_data use_device( A )
+
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
         {
-            //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom
-            MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
-
-            //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top
-            MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         }
     }
-    
-    //Wait for all processes to finish Warm-up
-    MPI_Barrier( MPI_COMM_WORLD );
 
-    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
 
     double runtime_serial = 0.0;
-    if ( rank == 0)
-    {
+    if (rank == 0) {
         printf("Calculate reference solution and time serial execution.\n");
-        //Timing of MPI rank 0 is used to calculate speedup do this in isolation
+        // Timing of MPI rank 0 is used to calculate speedup; do this in isolation
         double start = MPI_Wtime();
-        poisson2d_serial( rank, iter_max, tol, Aref, Anew, nx, ny, rhs );
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
         runtime_serial = MPI_Wtime() - start;
     }
-    MPI_Bcast(Aref, nx*ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
-    
-    //Wait for all processes to ensure correct timing of the parallel version
-    MPI_Barrier( MPI_COMM_WORLD );
-    if ( rank == 0) printf("Parallel execution.\n");
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
     double mpi_time = 0.0;
     double start = MPI_Wtime();
-    int iter  = 0;
+    int iter = 0;
     real error = 1.0;
-    
-    #pragma acc update device(A[(iy_start-1)*nx:((iy_end-iy_start)+2)*nx],rhs[iy_start*nx:(iy_end-iy_start)*nx])
-    while ( error > tol && iter < iter_max )
-    {
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
         error = 0.0;
 
-        #pragma acc parallel loop present(A,Anew,rhs)
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]
-                                                       + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));
-                error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
             }
         }
-        
+
         real globalerror = 0.0;
-        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
         error = globalerror;
-        
-        #pragma acc parallel loop present(A,Anew)
-        for( int ix = ix_start; ix < ix_end; ix++ )
-        {
-            A[(iy_start)*nx+ix] = Anew[(iy_start)*nx+ix];
-            A[(iy_end-1)*nx+ix] = Anew[(iy_end-1)*nx+ix];
+
+#pragma acc parallel loop present(A, Anew)
+        for (int ix = ix_start; ix < ix_end; ix++) {
+            A[(iy_start)*nx + ix] = Anew[(iy_start)*nx + ix];
+            A[(iy_end - 1) * nx + ix] = Anew[(iy_end - 1) * nx + ix];
         }
-        
-        #pragma acc parallel loop present(A,Anew) async
-        for (int iy = iy_start+1; iy < iy_end-1; iy++)
-        {
-            for( int ix = ix_start; ix < ix_end; ix++ )
-            {
-                A[iy*nx+ix] = Anew[iy*nx+ix];
+
+#pragma acc parallel loop present(A, Anew) async
+        for (int iy = iy_start + 1; iy < iy_end - 1; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
             }
         }
 
-        //Periodic boundary conditions
-        int top    = (rank == 0) ? (size-1) : rank-1;
-        int bottom = (rank == (size-1)) ? 0 : rank+1;
-        #pragma acc host_data use_device( A )
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
         {
             double start_mpi = MPI_Wtime();
-            //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom
-            MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
-
-            //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top
-            MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
-                          A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
-                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
             mpi_time += MPI_Wtime() - start_mpi;
         }
-        #pragma acc wait
+#pragma acc wait
 
-        #pragma acc parallel loop present(A)
-        for (int iy = iy_start; iy < iy_end; iy++)
-        {
-                A[iy*nx+0]      = A[iy*nx+(nx-2)];
-                A[iy*nx+(nx-1)] = A[iy*nx+1];
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
         }
-        
-        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
-        
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
         iter++;
     }
-    #pragma acc update self(A[(iy_start-1)*nx:((iy_end-iy_start)+2)*nx])
-    MPI_Barrier( MPI_COMM_WORLD );
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
     double runtime = MPI_Wtime() - start;
 
     int errors = 0;
-    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx ))
-    {
-        if ( rank == 0 )
-        {
-            printf( "Num GPUs: %d.\n", size );
-            printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", ny,nx, runtime_serial, size, runtime, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
-            printf( "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time, (iter*4*(ix_end-ix_start)*sizeof(real))/(1024*1024*1024*mpi_time) );
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
         }
-    }
-    else
-    {
+    } else {
         errors = -1;
     }
 
-    #pragma acc exit data delete(A,Aref,Anew,rhs)
+#pragma acc exit data delete (A, Aref, Anew, rhs)
     MPI_Finalize();
-    
+
     free(rhs);
     free(Anew);
     free(Aref);
diff --git a/4-GPU/HandsOn/Solution/C/task3/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task3/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0229e7530bd3fc021baafe0a744506640b34e67
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task3/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task4/Makefile b/4-GPU/HandsOn/Solution/C/task4/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5fac788bd850665aaee74b207efc944bc47ae7dc
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task4/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicxx
+CFLAGS = -DUSE_DOUBLE 
+
+#NVSHMEM_HOME=${HOME}/nvshmem-master/build
+NVSHMEM_LIBS= -L${NVSHMEM_HOME}/lib -lnvshmem -Mcuda -lcuda -lrt 
+NVSHMEM_INC = -I${NVSHMEM_HOME}/include
+
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=4
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.c poisson2d_serial.o -o poisson2d ${NVSHMEM_LIBS}
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution ${NVSHMEM_LIBS}
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
diff --git a/4-GPU/HandsOn/Solution/C/task4/common.h b/4-GPU/HandsOn/Solution/C/task4/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..381bf32b1432f32332420c8be8ac8d364d8c02bf
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task4/common.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct {
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task4/pgprof.poisson2d.Task4.solution.tar.gz b/4-GPU/HandsOn/Solution/C/task4/pgprof.poisson2d.Task4.solution.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..4344f056d83b555439d712ddb0f2c29117993e69
Binary files /dev/null and b/4-GPU/HandsOn/Solution/C/task4/pgprof.poisson2d.Task4.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/C/task4/poisson2d.c b/4-GPU/HandsOn/Solution/C/task4/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..2d11e0fa26c7af5aac4f1fbd33b3d7465f8d9b80
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task4/poisson2d.c
@@ -0,0 +1,245 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// TODO: Include necessary headers for NVSHMEM
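+// One way to do this, as in the solution file later in this patch:
+//   #include <nvshmem.h>
+//   #include <nvshmemx.h>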
+
+// Helper function to map existing device allocation to host allocation for NVSHMEM
+void map(real* restrict harr, real* restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char** argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // TODO: Initialize NVSHMEM with MPI using nvshmemx_init_attr
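+    // One way to do this, taken from the solution file later in this patch:
+    //   MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    //   nvshmemx_init_attr_t attr;
+    //   attr.mpi_comm = &mpi_comm;
+    //   nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);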
+
+#pragma acc set device_num(rank)
+
+    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
+    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
+
+    // TODO: Allocate symmetric device memory for A
+    // real *d_A = ...
+
+    // TODO: For OpenACC we need to map it to A and Anew so that OpenACC knows we already allocated
+    // device memory for A and Anew
+    // You can use the helper function map(...) above or use acc_map_data directly
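+    // One way to do this, taken from the solution file later in this patch:
+    //   real* d_A = (real*)nvshmem_malloc(nx * ny * sizeof(real));
+    //   map(A, d_A, nx * ny * sizeof(real));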
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // TODO: Warming up MPI / CUDA IPC is not needed with NVSHMEM - remove that part
+    // MPI Warm-up to establish CUDA IPC connections
+    for (int i = 0; i < 2; ++i) {
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+
+            // TODO: Replace both MPI calls with one-sided nvshmem_<type>_put calls;
+            // make sure to put the data in the right location on the remote side
+
+            // 1. Send row iy_start (first modified row) to top; receive lower boundary (iy_end)
+            // from bottom
+            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+            // (iy_start-1) from top
+            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
+                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
+                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+            // TODO: Add a barrier to make sure the data has arrived from the remote PEs
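+            // One way to do this, following the solution file later in this patch; iy_end_top
+            // and iy_start_bottom are the neighbours' local row bounds, derived from chunk_size
+            // in the same way as iy_start/iy_end (nvshmem_double_put matches real because the
+            // Makefile builds with -DUSE_DOUBLE):
+            //   nvshmem_double_put((double*)(A + iy_end_top * nx + ix_start),
+            //                      (double*)(A + iy_start * nx + ix_start),
+            //                      (ix_end - ix_start), top);
+            //   nvshmem_double_put((double*)(A + (iy_start_bottom - 1) * nx + ix_start),
+            //                      (double*)(A + (iy_end - 1) * nx + ix_start),
+            //                      (ix_end - ix_start), bottom);
+            //   nvshmem_barrier_all();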
+
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    // TODO: free shmem memory
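+    // One way to do this, as in the solution file later in this patch:
+    //   nvshmem_free(d_A);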
+
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task4/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task4/poisson2d.solution.c
new file mode 100644
index 0000000000000000000000000000000000000000..68a65042b078f5976fa3419697893f956aea4276
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task4/poisson2d.solution.c
@@ -0,0 +1,254 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map existing device allocation to host allocation for NVSHMEM
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Warming up MPI is no longer needed
+
+    //     // MPI Warm-up to establish CUDA IPC connections
+    //     for (int i = 0; i < -1; ++i) {
+    //         int top = (rank == 0) ? (size - 1) : rank - 1;
+    //         int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+    // #pragma acc host_data use_device(A)
+    //         {
+    //             // 1. Send row iy_start (first modified row) to top; receive lower boundary
+    //             // (iy_end) from bottom
+    //             MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+    //                          A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+    //                          MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+    //             // 2. Send row (iy_end-1) (last modified row) to bottom; receive upper boundary
+    //             // (iy_start-1) from top
+    //             MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
+    //                          A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
+    //                          MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    //         }
+    //     }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate speedup; do this in isolation
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        int iy_start_top = top * chunk_size;
+        int iy_end_top = iy_start_top + chunk_size;
+
+        // Do not process boundaries
+        iy_start_top = max(iy_start_top, 1);
+        iy_end_top = min(iy_end_top, ny - 1);
+
+        int iy_start_bottom = bottom * chunk_size;
+        int iy_end_bottom = iy_start_bottom + chunk_size;
+
+        // Do not process boundaries
+        iy_start_bottom = max(iy_start_bottom, 1);
+        iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+        // Halo exchange
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+            nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),
+                               (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top);
+            nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+                               (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start),
+                               bottom);
+            nvshmem_barrier_all();
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task4/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task4/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a1f2ca159e1af0361f194c39e7231a1e0aa19
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task4/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task5/Makefile b/4-GPU/HandsOn/Solution/C/task5/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9456192f9c4b4cd765a1553ae4642f656e200192
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task5/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicxx
+CFLAGS = -DUSE_DOUBLE 
+
+NVSHMEM_HOME=${HOME}/nvshmem-master/build
+NVSHMEM_LIBS= -L${NVSHMEM_HOME}/lib -lnvshmem -Mcuda -lcuda -lrt 
+NVSHMEM_INC = -I${NVSHMEM_HOME}/include
+
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=5
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.c poisson2d_serial.o -o poisson2d ${NVSHMEM_LIBS}
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution ${NVSHMEM_LIBS}
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
diff --git a/4-GPU/HandsOn/Solution/C/task5/common.h b/4-GPU/HandsOn/Solution/C/task5/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task5/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task5/pgprof.poisson2d.Task5.solution.tar.gz b/4-GPU/HandsOn/Solution/C/task5/pgprof.poisson2d.Task5.solution.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d57a92a7579386e22881dae28622048dad870c40
Binary files /dev/null and b/4-GPU/HandsOn/Solution/C/task5/pgprof.poisson2d.Task5.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/C/task5/poisson2d.c b/4-GPU/HandsOn/Solution/C/task5/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..3d91e14823e6f137a4c8cc84578c359b226625b7
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task5/poisson2d.c
@@ -0,0 +1,238 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map an existing device allocation to a host allocation for NVSHMEM
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
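+// Note: acc_map_data registers darr as the device copy of the host array harr, so OpenACC
+// clauses such as present(A) and host_data use_device(A) on that array resolve to the
+// NVSHMEM-allocated device buffer instead of a separate OpenACC-managed allocation.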
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
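+    // NVSHMEM is bootstrapped from the existing MPI communicator, so the MPI ranks of
+    // MPI_COMM_WORLD also serve as the NVSHMEM PE indices used for the puts below.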
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation.
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    
+    // TODO: Measuring the MPI time with asynchronous compute is not well defined. Remove it here and below.
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+// TODO: Check which parts of the while loop can be executed asynchronously by adding the async
+// clause. You might also need to use wait.
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        int iy_start_top = top * chunk_size;
+        int iy_end_top = iy_start_top + chunk_size;
+
+        // Do not process boundaries
+        iy_start_top = max(iy_start_top, 1);
+        iy_end_top = min(iy_end_top, ny - 1);
+
+        int iy_start_bottom = bottom * chunk_size;
+        int iy_end_bottom = iy_start_bottom + chunk_size;
+
+        // Do not process boundaries
+        iy_start_bottom = max(iy_start_bottom, 1);
+        iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+        // Halo exchange
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+            // TODO: Get the CUDA stream that corresponds to the OpenACC default async queue
+            // using the acc_get_cuda_stream and acc_get_default_async helper functions.
+            // Replace the nvshmem_double_put calls with the nvshmemx_double_put_on_stream
+            // versions and do the same for the barrier.
+
+            nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),
+                               (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top);
+            nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+                               (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start),
+                               bottom);
+            nvshmem_barrier_all();
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task5/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task5/poisson2d.solution.c
new file mode 100644
index 0000000000000000000000000000000000000000..97fa97a756d602d226c8375cb81526099a506917
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task5/poisson2d.solution.c
@@ -0,0 +1,229 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map an existing device allocation to a host allocation for NVSHMEM
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation.
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    // double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+#pragma acc parallel loop present(A, Anew, rhs) async
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew) async
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        int iy_start_top = top * chunk_size;
+        int iy_end_top = iy_start_top + chunk_size;
+
+        // Do not process boundaries
+        iy_start_top = max(iy_start_top, 1);
+        iy_end_top = min(iy_end_top, ny - 1);
+
+        int iy_start_bottom = bottom * chunk_size;
+        int iy_end_bottom = iy_start_bottom + chunk_size;
+
+        // Do not process boundaries
+        iy_start_bottom = max(iy_start_bottom, 1);
+        iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+        // Halo exchange
+#pragma acc host_data use_device(A)
+        {
+            nvshmemx_double_put_on_stream(
+                (double *)(A + iy_end_top * nx + ix_start),
+                (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top,
+                (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
+            nvshmemx_double_put_on_stream(
+                (double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+                (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start), bottom,
+                (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
+        }
+        nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
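+        // acc_get_cuda_stream(acc_get_default_async()) returns the CUDA stream backing the
+        // OpenACC default async queue, so the two puts and this barrier are enqueued behind the
+        // asynchronous compute kernels above without any host-side synchronization per iteration.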
+
+#pragma acc parallel loop present(A) async
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
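+// The wait clause makes this update wait for all outstanding asynchronous device work before
+// A is copied back to the host and the parallel runtime is measured.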
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            // printf( "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+            // (iter*4*(ix_end-ix_start)*sizeof(real))/(1024*1024*1024*mpi_time) );
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task5/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task5/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a1f2ca159e1af0361f194c39e7231a1e0aa19
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task5/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task6/Makefile b/4-GPU/HandsOn/Solution/C/task6/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9ef982901088aa0f7c3b2a51eda234d4904575c6
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task6/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+CC = mpicxx
+CFLAGS = -DUSE_DOUBLE 
+
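+# NVSHMEM_HOME below is site-specific; adjust it to point at your NVSHMEM installation/build directory.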
+NVSHMEM_HOME=${HOME}/nvshmem-master/build
+NVSHMEM_LIBS= -L${NVSHMEM_HOME}/lib -lnvshmem -Mcuda -lcuda -lrt 
+NVSHMEM_INC = -I${NVSHMEM_HOME}/include
+
+ifeq ($(COMPILER),GCC)
+	CFLAGS += -std=c99 -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	CFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	CFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi
+
+TASK=6
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.c common.h  Makefile
+	$(CC) -c $(CFLAGS) poisson2d_serial.c -o poisson2d_serial.o
+
+poisson2d: poisson2d.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.c poisson2d_serial.o -o poisson2d ${NVSHMEM_LIBS}
+
+poisson2d.solution: poisson2d.solution.c common.h poisson2d_serial.o Makefile
+	$(CC) $(CFLAGS) ${NVSHMEM_INC} poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution ${NVSHMEM_LIBS}
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof
diff --git a/4-GPU/HandsOn/Solution/C/task6/common.h b/4-GPU/HandsOn/Solution/C/task6/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d65fb3272fa9baa8a8f74e3d8208b76c0f19c8
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task6/common.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <assert.h>
+
+#ifdef USE_DOUBLE
+    typedef double real;
+    #define fmaxr fmax
+    #define fabsr fabs
+    #define expr exp
+    #define MPI_REAL_TYPE MPI_DOUBLE
+#else
+    typedef float real;
+    #define fmaxr fmaxf
+    #define fabsr fabsf
+    #define expr expf
+    #define MPI_REAL_TYPE MPI_FLOAT
+#endif
+
+typedef struct
+{
+    int y;
+    int x;
+} dim2;
+
+#define MAX_MPI_SIZE 16
+
+static dim2 size_to_size2d_map[MAX_MPI_SIZE+1] = { {0,0},
+    {1,1}, {2,1}, {3,1}, {2,2},
+    {5,1}, {3,2}, {7,1}, {4,2},
+    {3,3}, {5,2}, {11,1}, {6,2},
+    {13,1}, {7,2}, {5,3}, {4,4}
+};
+
+inline int min( int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline int max( int a, int b)
+{
+    return a > b ? a : b;
+}
+
+void poisson2d_serial( int rank, int iter_max, real tol, real* restrict const Aref, real* restrict const Anew, int nx, int ny, const real* restrict const rhs );
+
+int check_results( int rank, int ix_start, int ix_end,  int iy_start, int iy_end, real tol, const real* restrict const A, const real* restrict const Aref, int nx );
+
+static dim2 size_to_2Dsize( int size )
+{
+    assert(size<=MAX_MPI_SIZE);
+    return size_to_size2d_map[size];
+}
+
+#endif // COMMON_H
diff --git a/4-GPU/HandsOn/Solution/C/task6/pgprof.poisson2d.Task6.solution.tar.gz b/4-GPU/HandsOn/Solution/C/task6/pgprof.poisson2d.Task6.solution.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..76d04e69d5e9b13296c0465ba0b58f0cb9a9383d
Binary files /dev/null and b/4-GPU/HandsOn/Solution/C/task6/pgprof.poisson2d.Task6.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/C/task6/poisson2d.c b/4-GPU/HandsOn/Solution/C/task6/poisson2d.c
new file mode 100644
index 0000000000000000000000000000000000000000..0486c22d58d3d92d543b1778c7dcbd474eb0c470
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task6/poisson2d.c
@@ -0,0 +1,245 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map an existing device allocation to a host allocation for NVSHMEM
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
+
+    // TODO: Get an nvshmem_ptr to the d_A allocation of the top and bottom PEs;
+    // use nvshmem_ptr(void* ptr, int pe)
+    // real * restrict d_Atop =
+    // real * restrict d_Abottom =
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation.
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    
+    // TODO: Measuring the MPI time with asynchronous compute is not well defined. Remove it here and below.
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    // TODO: Optional: Execute asynchronously where possible
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+// TODO: Check which parts of the while loop can be executed asynchronously by adding the async
+// clause. You might also need to use wait.
+#pragma acc parallel loop present(A, Anew, rhs)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+        // TODO: If on the upper or lower boundary, also write directly into the top / bottom halo
+        // region. You need an acc deviceptr clause to use d_Atop and d_Abottom directly in the kernel.
+#pragma acc parallel loop present(A, Anew)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+            }
+        }
+
+        // Periodic boundary conditions
+        int top = (rank == 0) ? (size - 1) : rank - 1;
+        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+        int iy_start_top = top * chunk_size;
+        int iy_end_top = iy_start_top + chunk_size;
+
+        // Do not process boundaries
+        iy_start_top = max(iy_start_top, 1);
+        iy_end_top = min(iy_end_top, ny - 1);
+
+        int iy_start_bottom = bottom * chunk_size;
+        int iy_end_bottom = iy_start_bottom + chunk_size;
+
+        // Do not process boundaries
+        iy_start_bottom = max(iy_start_bottom, 1);
+        iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+        // Halo exchange
+#pragma acc host_data use_device(A)
+        {
+            double start_mpi = MPI_Wtime();
+
+            // TODO: Remove the explicit put as this is no longer needed
+            nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),
+                               (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top);
+            nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),
+                               (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start),
+                               bottom);
+            // TODO: When using async, get the CUDA stream that corresponds to the OpenACC default
+            // async queue and replace the barrier with one on that stream (nvshmemx_barrier_all_on_stream).
+            nvshmem_barrier_all();
+            mpi_time += MPI_Wtime() - start_mpi;
+        }
+
+#pragma acc parallel loop present(A)
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task6/poisson2d.solution.c b/4-GPU/HandsOn/Solution/C/task6/poisson2d.solution.c
new file mode 100644
index 0000000000000000000000000000000000000000..8deb2b2670c68e8ad43369b3d599f500a070e613
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task6/poisson2d.solution.c
@@ -0,0 +1,232 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <mpi.h>
+#include <openacc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// NVSHMEM
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+// Helper function to map an existing device allocation to a host allocation for NVSHMEM
+void map(real *restrict harr, real *restrict darr, int size) { acc_map_data(harr, darr, size); }
+
+int main(int argc, char **argv) {
+    int ny = 4096;
+    int nx = 4096;
+    int iter_max = 1000;
+    const real tol = 1.0e-5;
+
+    if (argc == 2) {
+        iter_max = atoi(argv[1]);
+    }
+
+    int rank = 0;
+    int size = 1;
+
+    // Initialize MPI and determine rank and size
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // NVSHMEM
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    nvshmemx_init_attr_t attr;
+    attr.mpi_comm = &mpi_comm;
+    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+#pragma acc set device_num(rank)
+
+    real *restrict const A = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Aref = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const Anew = (real *)malloc(nx * ny * sizeof(real));
+    real *restrict const rhs = (real *)malloc(nx * ny * sizeof(real));
+
+    // NVSHMEM
+    real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));
+    map(A, d_A, nx * ny * sizeof(real));
+
+    int top = (rank == 0) ? (size - 1) : rank - 1;
+    int bottom = (rank == (size - 1)) ? 0 : rank + 1;
+    real * restrict d_Atop = (real *)nvshmem_ptr(d_A, top);
+    real * restrict d_Abottom = (real *)nvshmem_ptr(d_A, bottom);
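+    // nvshmem_ptr returns a pointer through which this PE can load/store directly into the
+    // symmetric d_A allocation of the given neighbor PE. This assumes the peers are reachable
+    // via direct loads/stores (e.g. P2P within the node); otherwise nvshmem_ptr returns NULL.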
+
+    // set rhs
+    for (int iy = 1; iy < ny - 1; iy++) {
+        for (int ix = 1; ix < nx - 1; ix++) {
+            const real x = -1.0 + (2.0 * ix / (nx - 1));
+            const real y = -1.0 + (2.0 * iy / (ny - 1));
+            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
+        }
+    }
+
+#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])
+
+    int ix_start = 1;
+    int ix_end = (nx - 1);
+
+    // Ensure correctness if ny%size != 0
+    int chunk_size = ceil((1.0 * ny) / size);
+
+    int iy_start = rank * chunk_size;
+    int iy_end = iy_start + chunk_size;
+
+    // Do not process boundaries
+    iy_start = max(iy_start, 1);
+    iy_end = min(iy_end, ny - 1);
+
+// OpenACC Warm-up
+#pragma acc parallel loop present(A, Aref)
+    for (int iy = 0; iy < ny; iy++) {
+        for (int ix = 0; ix < nx; ix++) {
+            Aref[iy * nx + ix] = 0.0;
+            A[iy * nx + ix] = 0.0;
+        }
+    }
+
+    // Wait for all processes to finish Warm-up
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);
+
+    double runtime_serial = 0.0;
+    if (rank == 0) {
+        printf("Calculate reference solution and time serial execution.\n");
+        // Timing of MPI rank 0 is used to calculate the speedup; do this in isolation.
+        double start = MPI_Wtime();
+        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
+        runtime_serial = MPI_Wtime() - start;
+    }
+    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);
+
+    // Wait for all processes to ensure correct timing of the parallel version
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (rank == 0) printf("Parallel execution.\n");
+    
+    // TODO: Measuring the MPI time with asynchronous compute is not well defined. Remove it here and below.
+    double mpi_time = 0.0;
+    double start = MPI_Wtime();
+    int iter = 0;
+    real error = 1.0;
+
+    // Periodic boundary conditions
+    int iy_start_top = top * chunk_size;
+    int iy_end_top = iy_start_top + chunk_size;
+
+    // Do not process boundaries
+    iy_start_top = max(iy_start_top, 1);
+    iy_end_top = min(iy_end_top, ny - 1);
+
+    int iy_start_bottom = bottom * chunk_size;
+    int iy_end_bottom = iy_start_bottom + chunk_size;
+
+    // Do not process boundaries
+    iy_start_bottom = max(iy_start_bottom, 1);
+    iy_end_bottom = min(iy_end_bottom, ny - 1);
+
+#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
+                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
+    while (error > tol && iter < iter_max) {
+        error = 0.0;
+
+// TODO: Check which parts of the while loop can be executed asynchronously by adding the async
+// clause. You might also need to use wait.
+#pragma acc parallel loop present(A, Anew, rhs) async
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                Anew[iy * nx + ix] =
+                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
+                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
+                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
+            }
+        }
+
+        real globalerror = 0.0;
+        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
+        error = globalerror;
+
+#pragma acc parallel loop present(A, Anew) deviceptr(d_Atop, d_Abottom) async
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            for (int ix = ix_start; ix < ix_end; ix++) {
+                A[iy * nx + ix] = Anew[iy * nx + ix];
+                if (iy == iy_start) {
+                    d_Atop[iy_end_top * nx + ix] = Anew[iy * nx + ix];
+                }
+                if (iy == iy_end - 1) {
+                    d_Abottom[(iy_start_bottom - 1) * nx + ix] = Anew[iy * nx + ix];
+                }
+                }
+            }
+        }
+
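+        // The copy kernel above stores the boundary rows directly into the neighbors' halo
+        // regions through d_Atop/d_Abottom, so no explicit put is needed; the barrier on the
+        // OpenACC async stream below synchronizes all PEs so those stores are complete before
+        // the next iteration reads the halos.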
+        nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));
+
+#pragma acc parallel loop present(A) async
+        for (int iy = iy_start; iy < iy_end; iy++) {
+            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
+            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
+        }
+
+        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+        iter++;
+    }
+#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait
+    MPI_Barrier(MPI_COMM_WORLD);
+    double runtime = MPI_Wtime() - start;
+
+    int errors = 0;
+    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
+        if (rank == 0) {
+            printf("Num GPUs: %d.\n", size);
+            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
+                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
+                   runtime_serial / (size * runtime) * 100);
+            printf(
+                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
+                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
+        }
+    } else {
+        errors = -1;
+    }
+
+#pragma acc exit data delete (A, Aref, Anew, rhs)
+    MPI_Finalize();
+
+    free(rhs);
+    free(Anew);
+    free(Aref);
+    free(A);
+    nvshmem_free(d_A);
+    return errors;
+}
diff --git a/4-GPU/HandsOn/Solution/C/task6/poisson2d_serial.c b/4-GPU/HandsOn/Solution/C/task6/poisson2d_serial.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9a1f2ca159e1af0361f194c39e7231a1e0aa19
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/C/task6/poisson2d_serial.c
@@ -0,0 +1,98 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <math.h>
+#include <stdio.h>
+
+#include "common.h"
+
+void poisson2d_serial(int rank, int iter_max, real tol, real* restrict const Aref,
+                      real* restrict const Anew, int nx, int ny, const real* restrict const rhs) {
+    int iter = 0;
+    real error = 1.0;
+#pragma acc data present(Aref, Anew, rhs)
+    {
+#pragma acc update device(Aref [0:nx * ny], rhs [0:nx * ny])
+        while (error > tol && iter < iter_max) {
+            error = 0.0;
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Anew[iy * nx + ix] =
+                        -0.25 *
+                        (rhs[iy * nx + ix] - (Aref[iy * nx + (ix + 1)] + Aref[iy * nx + ix - 1] +
+                                              Aref[(iy - 1) * nx + ix] + Aref[(iy + 1) * nx + ix]));
+                    error = fmaxr(error, fabsr(Anew[iy * nx + ix] - Aref[iy * nx + ix]));
+                }
+            }
+
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                for (int ix = 1; ix < nx - 1; ix++) {
+                    Aref[iy * nx + ix] = Anew[iy * nx + ix];
+                }
+            }
+
+// Periodic boundary conditions
+#pragma acc parallel loop
+            for (int ix = 1; ix < nx - 1; ix++) {
+                Aref[0 * nx + ix] = Aref[(ny - 2) * nx + ix];
+                Aref[(ny - 1) * nx + ix] = Aref[1 * nx + ix];
+            }
+#pragma acc parallel loop
+            for (int iy = 1; iy < ny - 1; iy++) {
+                Aref[iy * nx + 0] = Aref[iy * nx + (nx - 2)];
+                Aref[iy * nx + (nx - 1)] = Aref[iy * nx + 1];
+            }
+
+            if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
+
+            iter++;
+        }
+#pragma acc update self(Aref [0:nx * ny])
+    }
+}
+
+int check_results(int rank, int ix_start, int ix_end, int iy_start, int iy_end, real tol,
+                  const real* restrict const A, const real* restrict const Aref, int nx) {
+    int result_correct = 1;
+    for (int iy = iy_start; iy < iy_end && (result_correct == 1); iy++) {
+        for (int ix = ix_start; ix < ix_end && (result_correct == 1); ix++) {
+            if (fabs(Aref[iy * nx + ix] - A[iy * nx + ix]) >= tol) {
+                fprintf(stderr, "[MPI%d] ERROR: A[%d][%d] = %f does not match %f (reference)\n",
+                        rank, iy, ix, A[iy * nx + ix], Aref[iy * nx + ix]);
+                result_correct = 0;
+            }
+        }
+    }
+#ifdef MPI_VERSION
+    int global_result_correct = 0;
+    MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    result_correct = global_result_correct;
+#endif  // MPI_VERSION
+    return result_correct;
+}
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task0/Makefile b/4-GPU/HandsOn/Solution/FORTRAN/task0/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e00f4b787321ba191f3e6565a066863e0f8812bb
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task0/Makefile
@@ -0,0 +1,53 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = pgfortran
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	FCFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+
+TASK=0
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.pgprof poisson2d.pgprof
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF}  -f -o ${SC19_DIR_SCRATCH}/poisson2d.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.pgprof .
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.pgprof .
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..2ba3475ba8bfb1b5bbb9e85c681d8f8b4715f5cb
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.F03
@@ -0,0 +1,149 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 2048
+    INTEGER, PARAMETER :: ny = 2048
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, ierror
+    REAL :: x,y, error
+    REAL*8 :: runtime_cpu, runtime, start, finish
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 500
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    iy_start = 2
+    iy_end = ny-1
+    
+    !OpenACC Warm-up
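+    !(The first OpenACC construct pays device and runtime start-up cost, so running a
+    ! throwaway kernel here keeps that overhead out of the timings below.)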
+    !$acc parallel loop
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+        END DO
+    END DO
+    !$acc end parallel
+        
+
+    WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+    WRITE(*,*) 'Calculate reference solution and time serial CPU execution.'
+    call cpu_time(start)
+    CALL poisson2d_serial( nx, ny, iter_max, tol, a_ref, a_new, rhs )
+    call cpu_time(finish)
+    runtime_cpu = finish-start
+    
+    WRITE(*,*) 'GPU execution.'
+    
+    call cpu_time(start)
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !TODO: Parallelize loop nest with OpenACC
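+        !      (Hint: the error update needs a max reduction, e.g. a reduction(max:error)
+        !       clause on the parallel loop; compare with poisson2d.solution.F03.)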
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        
+        
+        !TODO: Parallelize loop nest with OpenACC
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        
+        
+        !TODO: Parallelize loop nest with OpenACC
+        DO ix = ix_start, ix_end
+            a(ix,1) = a(ix,ny-1)
+            a(ix,ny) = a(ix,2)
+        END DO
+        
+        !TODO: Parallelize loop nest with OpenACC
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        
+
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    call cpu_time(finish)
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        WRITE(*,"(I4,'x',I4,': 1 CPU: ',F8.4,' s 1 GPU: ',F8.4,' s, speedup: ',F8.2)"), &
+              nx,ny,runtime_cpu,runtime,runtime_cpu/runtime
+    ELSE
+        errors = .TRUE.
+    END IF
+
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.solution.pgprof b/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.solution.pgprof
index 5113b61cae3788a37e12c1e98aac803cd28ac6a9..b7b6e9e9a3e3878afb9fdbdd6366767ac831c4cc 100644
Binary files a/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.solution.pgprof and b/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d.solution.pgprof differ
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d_serial.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..da5f37d95e2032ff21053c0f80f934a9685b8a23
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task0/poisson2d_serial.F03
@@ -0,0 +1,106 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max ,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    ix = ix_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task1/Makefile b/4-GPU/HandsOn/Solution/FORTRAN/task1/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2bfb79cdedc68ae33659e1149f62aad6f2741a52
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task1/Makefile
@@ -0,0 +1,62 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = pgfortran
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI)
+	FCFLAGS += -fast
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
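+# Note: -ta=tesla:cc70,managed,lineinfo targets Volta-class GPUs (compute capability 7.0), uses CUDA
+# managed (unified) memory and embeds source line info for the profiler; -ta=multicore runs the
+# OpenACC regions on the host CPU cores instead.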
+
+TASK=1
+NP ?= 1
+PGPROF=pgprof -f --cpu-profiling off
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.solution.*.pgprof poisson2d.*.pgprof *.tar.gz *.csv
+
+run: poisson2d
+	${SC19_SUBMIT_CMD} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.timeline.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF} --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.metrics.pgprof ./poisson2d 3
+	${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof ./poisson2d 3
+	pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.efficiency.csv
+	mv ${SC19_DIR_SCRATCH}/poisson2d.*.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.timeline.pgprof poisson2d.metrics.pgprof
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+		${SC19_SUBMIT_CMD} ${PGPROF}  -o ${SC19_DIR_SCRATCH}/poisson2d.solution.timeline.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --analysis-metrics -o ${SC19_DIR_SCRATCH}/poisson2d.solution.metrics.pgprof ./poisson2d.solution 3
+		${SC19_SUBMIT_CMD} ${PGPROF}  --metrics gld_efficiency,gst_efficiency -o ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3
+		pgprof --csv -i ${SC19_DIR_SCRATCH}/poisson2d.solution.efficiency.pgprof 2>&1 | grep -v "======" > poisson2d.solution.efficiency.csv
+		mv ${SC19_DIR_SCRATCH}/poisson2d.solution.*.pgprof .
+		tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.*.pgprof
+		
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz b/4-GPU/HandsOn/Solution/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz
index 9971df28716fcf7c3dc00c1a35dc40850dc956e3..1acf8dfa0bc423c4a58ec85618e7af721a93bc2f 100644
Binary files a/4-GPU/HandsOn/Solution/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz and b/4-GPU/HandsOn/Solution/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task1/poisson2d.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task1/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..a88858fb8c00fd8eab7d4b65e047c65bc7f3da98
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task1/poisson2d.F03
@@ -0,0 +1,150 @@
+! Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 2048
+    INTEGER, PARAMETER :: ny = 2048
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, ierror
+    REAL :: x,y, error
+    REAL*8 :: runtime_cpu, runtime, start, finish
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 500
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    iy_start = 2
+    iy_end = ny-1
+    
+    !OpenACC Warm-up
+    !$acc parallel loop
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+        END DO
+    END DO
+    !$acc end parallel
+        
+
+    WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+    WRITE(*,*) 'Calculate reference solution and time serial CPU execution.'
+    call cpu_time(start)
+    CALL poisson2d_serial( nx, ny, iter_max, tol, a_ref, a_new, rhs )
+    call cpu_time(finish)
+    runtime_cpu = finish-start
+    
+    WRITE(*,*) 'GPU execution.'
+    
+    call cpu_time(start)
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop
+        DO ix = ix_start, ix_end
+            DO iy = iy_start, iy_end
+                !TODO: Fix memory access pattern
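+                !      (Fortran arrays are column-major: the first index ix is contiguous in
+                !       memory, so ix should be the innermost, vectorized dimension rather
+                !       than the outer loop.)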
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO ix = ix_start, ix_end
+            a(ix,1) = a(ix,ny-1)
+            a(ix,ny) = a(ix,2)
+        END DO
+        !$acc end parallel
+        !$acc parallel loop
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        !$acc end parallel
+
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    call cpu_time(finish)
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        WRITE(*,"(I4,'x',I4,': 1 CPU: ',F8.4,' s 1 GPU: ',F8.4,' s, speedup: ',F8.2)"), &
+              nx,ny,runtime_cpu,runtime,runtime_cpu/runtime
+    ELSE
+        errors = .TRUE.
+    END IF
+
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task1/poisson2d_serial.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task1/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..da5f37d95e2032ff21053c0f80f934a9685b8a23
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task1/poisson2d_serial.F03
@@ -0,0 +1,106 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max ,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        
+        IF ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    ix = ix_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task2/Makefile b/4-GPU/HandsOn/Solution/FORTRAN/task2/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..46380b18350a61f0fcc8297db6a6c423f99343ee
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task2/Makefile
@@ -0,0 +1,54 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = mpifort
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --annotate-mpi openmpi
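+# --annotate-mpi openmpi adds MPI call annotations to the profiler timeline (pgprof supports Open MPI and MPICH here).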
+
+TASK=2
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
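+# --smpiargs "-gpu" enables GPU (CUDA-aware) buffer support in IBM Spectrum MPI; the remaining flags
+# are site-specific rank count and placement options for the submit command.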
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof 
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz b/4-GPU/HandsOn/Solution/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz
index a9ed0efcbf727d80ff218e05ae7fd686d92d9b86..1d73be120f61bac844ff4867acd24595b81f399d 100644
Binary files a/4-GPU/HandsOn/Solution/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz and b/4-GPU/HandsOn/Solution/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task2/poisson2d.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task2/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..c5f5ae58e54e78da0cc2c39d338f1d0eeadc91ca
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task2/poisson2d.F03
@@ -0,0 +1,239 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+#if _OPENACC
+    USE openacc
+#endif
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 4096
+    INTEGER, PARAMETER :: ny = 4096
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, mpi_rank, mpi_size, device_type, ngpus, devicenum, ierror
+    INTEGER :: chunk_size, right, left
+    REAL :: x,y, error, globalerror
+    REAL*8 :: runtime_serial, runtime, start, finish, mpi_time, mpi_start_time
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 1000
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    mpi_rank = 0
+    mpi_size = 1
+    
+    !Initialize MPI and determine rank and size
+    CALL MPI_Init(ierror)
+    CALL MPI_Comm_rank(MPI_COMM_WORLD,mpi_rank,ierror)
+    CALL MPI_Comm_size(MPI_COMM_WORLD,mpi_size,ierror)
+    
+    !TODO: handle device affinity
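+    !      (One approach, used in the later tasks: ngpus = acc_get_num_devices(acc_device_nvidia)
+    !       and acc_set_device_num(MOD(mpi_rank, ngpus), acc_device_nvidia), so ranks on a node
+    !       map round-robin onto the node's GPUs.)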
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    !$acc enter data create(a,a_ref,a_new,rhs)
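+    !(enter data create allocates device copies of the arrays without transferring any data;
+    ! the explicit update directives below move only what is needed.)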
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    !TODO: set first and last row to be processed by this rank.
+    iy_start = 2
+    iy_end = ny-1
+    
+    !OpenACC Warm-up
+    !$acc parallel loop present(a,a_ref)
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+            a_ref(ix,iy) = 0.0
+        END DO
+    END DO
+    
+        
+    !MPI Warm-up to establish CUDA IPC connections
+    DO i = 1,2
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        !$acc host_data use_device( a )
+            !1. Send column iy_start (first modified column) to left; receive right boundary (iy_end+1) from right
+            CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+
+            !2. Send column iy_end (last modified column) to right; receive left boundary (iy_start-1) from left
+            CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !$acc end host_data
+    END DO
+    
+    !Wait for all processes to finish Warm-up
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+        WRITE(*,*) 'Calculate reference solution and time serial execution.'
+        !Timing of MPI rank 0 is used to calculate the speedup, so do this in isolation
+        start = MPI_WTIME()
+        CALL poisson2d_serial( nx, ny, iter_max, mpi_rank, tol, a_ref, a_new, rhs )
+        finish = MPI_WTIME()
+        runtime_serial = finish-start
+    END IF
+    CALL MPI_Bcast(a_ref, size(a_ref), MPI_REAL_TYPE, 0, MPI_COMM_WORLD, ierror)
+    
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,*) 'Parallel execution.'
+    END IF 
+    
+    mpi_time = 0.0
+    start = MPI_WTIME()
+    iter = 1
+    error = 1.0
+    !$acc update device(a(1:nx,iy_start:iy_end),rhs(1:nx,iy_start:iy_end))
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop present(a,a_new,rhs)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        !Calculate global error across all ranks
+        globalerror = 0.0
+        call MPI_Allreduce( error, globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD, ierror )
+        error = globalerror
+        
+        !$acc parallel loop present(a,a_new)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !TODO: Handle periodic boundary conditions and halo exchange with MPI
+        !$acc parallel loop
+        DO ix = ix_start, ix_end
+            a(ix,iy_start-1) = a(ix,iy_end)
+            a(ix,iy_end+1) = a(ix,iy_start)
+        END DO
+        !$acc end parallel
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        
+        mpi_start_time = MPI_WTIME()
+        !TODO: Pass device ptr of A to MPI using host_data use_device
+            !TODO: 1. Send column iy_start (first modified column) to left; receive right boundary (iy_end+1) from right
+            ! CALL MPI_SENDRECV(SENDBUF, SENDCOUNT, MPI_REAL_TYPE, DEST, 0, RECVBUF, RECVCOUNT, MPI_REAL_TYPE, SOURCE, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE, ierror)
+
+            !TODO: 2. Send column iy_end (last modified column) to right; receive left boundary (iy_start-1) from left
+            ! CALL MPI_SENDRECV(SENDBUF, SENDCOUNT, MPI_REAL_TYPE, DEST, 0, RECVBUF, RECVCOUNT, MPI_REAL_TYPE, SOURCE, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE, ierror)
+        !TODO: !$acc end host_data
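+        !(host_data use_device(a) exposes the device address of a inside the region, so a
+        ! CUDA-aware MPI can move the halo rows directly between GPUs; the warm-up exchange
+        ! above shows the filled-in pattern.)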
+        mpi_time = (MPI_WTIME() - mpi_start_time) + mpi_time
+        
+        !$acc parallel loop present(a)
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        !$acc end parallel
+
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a(1:nx,iy_start:iy_end))
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    finish = MPI_WTIME()
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        IF ( mpi_rank == 0 ) THEN
+            WRITE(*,*) 'Num GPUs: ', mpi_size
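+            !(The bandwidth figure below is an estimate: 4 halo messages of (ix_end-ix_start)
+            ! reals per iteration divided by the accumulated MPI time, not a measured link bandwidth.)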
+            WRITE(*,"(I4,'x',I4,': 1 GPU: ',F8.4,' s ',I1,' GPUs: ',F8.4,' s, speedup: ',F8.2,' efficiency: ',F8.2)"), &
+                  nx,ny,runtime_serial,mpi_size,runtime,runtime_serial/runtime,runtime_serial/(mpi_size*runtime)*100
+            WRITE(*,"('MPI time: 'F8.4' s, inter GPU BW: 'F8.2' GiB/s')"), &
+                  mpi_time,(iter*4*(ix_end-ix_start)*SIZEOF(a(1,1)))/(1024*1024*1024*mpi_time)
+        END IF
+    ELSE
+        errors = .TRUE.
+    END IF
+    
+    !$acc exit data delete(a,a_ref,a_new,rhs)
+    CALL MPI_Finalize(ierror)
+    
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task2/poisson2d_serial.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task2/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..8a6e0a9f25deb2e6a615e3e1ba214f48d93a4ac1
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task2/poisson2d_serial.F03
@@ -0,0 +1,126 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max, mpi_rank,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max, mpi_rank
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
+    !$acc data present(a_ref,rhs,a_new)
+    !$acc update device(a_ref,rhs)
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        !$acc end parallel
+        
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a_ref)
+    !$acc end data
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    ix = ix_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('[MPI',I1,'] ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    mpi_rank,ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    
+    CALL MPI_Initialized(mpi_is_initialized, ierror)
+    IF ( mpi_is_initialized ) THEN
+        global_no_errors = .FALSE.
+        CALL MPI_ALLREDUCE(no_errors, global_no_errors, 1, MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, ierror)
+        no_errors = global_no_errors
+    END IF
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task3/Makefile b/4-GPU/HandsOn/Solution/FORTRAN/task3/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..62dcaf89c710b1f2839a4088d87f7ef7df4f9311
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task3/Makefile
@@ -0,0 +1,54 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+COMPILER ?= PGI-tesla
+FC = mpifort
+ifeq ($(COMPILER),GCC)
+FCFLAGS = -freal-4-real-8 -DMPI_REAL_TYPE=MPI_REAL8
+else
+FCFLAGS = -r8 -DMPI_REAL_TYPE=MPI_REAL8
+endif
+ifeq ($(COMPILER),GCC)
+	FCFLAGS += -march=native -O3 -lm
+else ifeq ($(COMPILER),PGI-tesla)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=tesla:cc70,pinned
+else ifeq ($(COMPILER),PGI-multicore)
+	FCFLAGS += -Minfo=accel -fast -acc -ta=multicore
+endif
+PGPROF=pgprof -f --cpu-profiling off --annotate-mpi openmpi
+
+TASK=3
+NP ?= 6
+SC19_SUBMIT_CMD_GPU = ${SC19_SUBMIT_CMD} -a $(NP) -c ALL_CPUS -d cyclic -b packed:7 --smpiargs "-gpu"
+
+all: poisson2d
+
+poisson2d_serial.o: poisson2d_serial.F03 Makefile
+	$(FC) -c $(FCFLAGS) poisson2d_serial.F03 -o poisson2d_serial.o
+
+poisson2d: poisson2d.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.F03 poisson2d_serial.o -o poisson2d
+
+poisson2d.solution: poisson2d.solution.F03 poisson2d_serial.o Makefile
+	$(FC) $(FCFLAGS) poisson2d.solution.F03 poisson2d_serial.o -o poisson2d.solution
+
+clean:
+	rm -f poisson2d poisson2d.solution poisson2d*.o poisson2d.*.pgprof *.tar.gz
+
+run: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d
+
+#Disable CPU Profiling to minimize size of profiles
+profile: poisson2d
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.Task${TASK}.NP${NP}.?.pgprof .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.tar.gz poisson2d.Task${TASK}.NP${NP}.?.pgprof 
+
+solution: poisson2d.solution
+
+run.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ./poisson2d.solution
+
+#Disable CPU Profiling to minimize size of profiles
+profile.solution: poisson2d.solution
+	${SC19_SUBMIT_CMD_GPU} ${PGPROF} -o ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+	mv ${SC19_DIR_SCRATCH}/poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof  .
+	tar -cvzf pgprof.poisson2d.Task${TASK}.solution.tar.gz poisson2d.solution.Task${TASK}.NP${NP}.?.pgprof 
\ No newline at end of file
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz b/4-GPU/HandsOn/Solution/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz
index 42f0677908c71932313a0cd8f56769397e1797ec..d1fdfb24da31ebc5449d71280f2cabd621dc5f7a 100644
Binary files a/4-GPU/HandsOn/Solution/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz and b/4-GPU/HandsOn/Solution/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz differ
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task3/poisson2d.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task3/poisson2d.F03
new file mode 100644
index 0000000000000000000000000000000000000000..d341dd7328c8f8cdcbe94c0f4e47b2cfaf3aef10
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task3/poisson2d.F03
@@ -0,0 +1,254 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+PROGRAM poisson2d
+#if _OPENACC
+    USE openacc
+#endif
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, PARAMETER :: MAX_ITER_MAX_DIGITS = 512
+    INTEGER, PARAMETER :: nx = 4096
+    INTEGER, PARAMETER :: ny = 4096
+    REAL, PARAMETER :: tol = 1.0E-5
+    INTEGER :: i,ix, iy, ix_start, ix_end, iy_start, iy_end, iter, iter_max, mpi_rank, mpi_size, device_type, ngpus, devicenum, ierror
+    INTEGER :: chunk_size, right, left
+    REAL :: x,y, error, globalerror
+    REAL*8 :: runtime_serial, runtime, start, finish, mpi_time, mpi_start_time
+    LOGICAL, EXTERNAL :: check_results
+    LOGICAL :: errors
+    REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_ref, a_new, rhs
+    CHARACTER(MAX_ITER_MAX_DIGITS) :: iter_max_arg
+    
+    call getarg(1, iter_max_arg)
+    
+    IF ( iter_max_arg == '' ) THEN
+        iter_max = 1000
+    ELSE
+        read (iter_max_arg, *) iter_max
+    ENDIF
+    
+    mpi_rank = 0
+    mpi_size = 1
+    
+    !Initialize MPI and determine rank and size
+    CALL MPI_Init(ierror)
+    CALL MPI_Comm_rank(MPI_COMM_WORLD,mpi_rank,ierror)
+    CALL MPI_Comm_size(MPI_COMM_WORLD,mpi_size,ierror)
+    
+#if _OPENACC
+    device_type = acc_get_device_type()
+    IF ( acc_device_nvidia == device_type ) THEN
+        ngpus=acc_get_num_devices( acc_device_nvidia )
+        !choose device to use by this rank
+        devicenum = MOD( mpi_rank, ngpus )
+        call acc_set_device_num( devicenum, acc_device_nvidia )
+    END IF
+    call acc_init( device_type )
+#endif
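+    !(Binding each rank to MOD(mpi_rank, ngpus) spreads the ranks across the node's GPUs, and
+    ! calling acc_init here keeps runtime start-up cost out of the timed regions.)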
+    
+    ALLOCATE( a(nx,ny) )
+    ALLOCATE( a_ref(nx,ny) )
+    ALLOCATE( a_new(nx,ny) )
+    ALLOCATE( rhs(nx,ny) )
+    
+    a = 0.0
+    a_ref = 0.0
+    
+    DO iy = 2, ny-1
+        DO ix = 2, nx-1
+            x = -1.0 + (2.0*ix/(nx-1.0))
+            y = -1.0 + (2.0*iy/(ny-1.0))
+            rhs(ix,iy) = EXP(-10.0*(x*x+y*y))
+        END DO
+    END DO
+    
+    !$acc enter data create(a,a_ref,a_new,rhs)
+    
+    ix_start = 2
+    ix_end   = nx-1
+    
+    !set first and last row to be processed by this rank.
+    !Ensure correctness if ny is not evenly divisible by mpi_size
+    chunk_size = CEILING( (1.0*ny)/mpi_size )
+    iy_start = mpi_rank * chunk_size
+    iy_end = iy_start + chunk_size - 1
+    
+    !Do not process boundaries
+    iy_start = MAX( iy_start, 2 )
+    iy_end = MIN( iy_end, ny-1 )
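+    !(Example: ny = 4096 with 6 ranks gives chunk_size = 683; rank 0 then works on rows 2..682
+    ! and rank 5 on rows 3415..4095, so the uneven remainder is absorbed by the last rank.)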
+    
+    !OpenACC Warm-up
+    !$acc parallel loop present(a,a_ref)
+    DO iy = 1, ny
+        DO ix = 1, nx
+            a(ix,iy) = 0.0
+            a_ref(ix,iy) = 0.0
+        END DO
+    END DO
+    
+    !MPI Warm-up to establish CUDA IPC connections
+    DO i = 1,2
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        !$acc host_data use_device( a )
+            !1. Send column iy_start (first modified column) to left; receive right boundary (iy_end+1) from right
+            CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                              a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                              MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+
+            !2. Send column iy_end (last modified column) to right; receive left boundary (iy_start-1) from left
+            CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !$acc end host_data
+    END DO
+    
+    !Wait for all processes to finish Warm-up
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,"('Jacobi relaxation Calculation: ',I4,' x ',I4,' mesh')") nx,ny
+        WRITE(*,*) 'Calculate reference solution and time serial execution.'
+        !Timing of MPI rank 0 is used to calculate the speedup, so do this in isolation
+        start = MPI_WTIME()
+        CALL poisson2d_serial( nx, ny, iter_max, mpi_rank, tol, a_ref, a_new, rhs )
+        finish = MPI_WTIME()
+        runtime_serial = finish-start
+    END IF
+    CALL MPI_Bcast(a_ref, size(a_ref), MPI_REAL_TYPE, 0, MPI_COMM_WORLD, ierror)
+    
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    
+    IF ( mpi_rank == 0 ) THEN
+        WRITE(*,*) 'Parallel execution.'
+    END IF 
+    
+    mpi_time = 0.0
+    start = MPI_WTIME()
+    iter = 1
+    error = 1.0
+    !$acc update device(a(1:nx,iy_start:iy_end),rhs(1:nx,iy_start:iy_end))
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop present(a,a_new,rhs)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a(ix+1,iy) + a(ix-1,iy) + a(ix,iy-1) + a(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        !Calculate global error across all ranks
+        globalerror = 0.0
+        call MPI_Allreduce( error, globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD, ierror )
+        error = globalerror
+        
+        !TODO: Split into halo and bulk part
+        !$acc parallel loop present(a,a_new)
+        DO iy = iy_start, iy_end
+            DO ix = ix_start, ix_end
+                a(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        !TODO: Start bulk part asynchronously
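+        !      (Idea: copy the two boundary rows iy_start and iy_end in a small synchronous kernel
+        !       first, launch the interior copy with async, do the MPI halo exchange meanwhile, and
+        !       wait on the async queue before those values are needed.)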
+        
+        !Handle periodic boundary conditions and halo exchange with MPI
+        left = mpi_rank-1
+        IF ( mpi_rank == 0 ) THEN
+            left = mpi_size-1
+        END IF
+        right = mpi_rank+1
+        IF ( mpi_rank == mpi_size-1 ) THEN
+            right = 0
+        END IF
+        
+        mpi_start_time = MPI_WTIME()
+        !$acc host_data use_device( a )
+            !1. Send column iy_start (first modified column) to left; receive right boundary (iy_end+1) from right
+            CALL MPI_Sendrecv( a(ix_start,iy_start), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               a(ix_start,iy_end+1), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+
+            !2. Send column iy_end (last modified column) to right; receive left boundary (iy_start-1) from left
+            CALL MPI_Sendrecv( a(ix_start,iy_end), (ix_end-ix_start)+1, MPI_REAL_TYPE, right, 0, &
+                               a(ix_start,(iy_start-1)), (ix_end-ix_start)+1, MPI_REAL_TYPE, left   , 0, &
+                               MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierror )
+        !$acc end host_data
+        mpi_time = (MPI_WTIME() - mpi_start_time) + mpi_time
+        !TODO: wait for bulk part
+        
+        !$acc parallel loop present(a)
+        DO iy = iy_start, iy_end
+            a(1,iy) = a(nx-1,iy)
+            a(nx,iy) = a(2,iy)
+        END DO
+        !$acc end parallel
+
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a(1:nx,iy_start:iy_end))
+    !Wait for all processes to ensure correct timing of the parallel version
+    CALL MPI_Barrier( MPI_COMM_WORLD, ierror )
+    finish = MPI_WTIME()
+    runtime = finish-start
+    
+    errors = .FALSE.
+    IF ( check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref ) ) THEN
+        IF ( mpi_rank == 0 ) THEN
+            WRITE(*,*) 'Num GPUs: ', mpi_size
+            WRITE(*,"(I4,'x',I4,': 1 GPU: ',F8.4,' s ',I1,' GPUs: ',F8.4,' s, speedup: ',F8.2,' efficiency: ',F8.2)"), &
+                  nx,ny,runtime_serial,mpi_size,runtime,runtime_serial/runtime,runtime_serial/(mpi_size*runtime)*100
+            WRITE(*,"('MPI time: 'F8.4' s, inter GPU BW: 'F8.2' GiB/s')"), &
+                  mpi_time,(iter*4*(ix_end-ix_start)*SIZEOF(a(1,1)))/(1024*1024*1024*mpi_time)
+        END IF
+    ELSE
+        errors = .TRUE.
+    END IF
+    
+    !$acc exit data delete(a,a_ref,a_new,rhs)
+    CALL MPI_Finalize(ierror)
+    
+    DEALLOCATE( rhs )
+    DEALLOCATE( a_new )
+    DEALLOCATE( a_ref )
+    DEALLOCATE( a )
+    IF ( errors ) THEN
+        STOP -1
+    END IF
+END PROGRAM poisson2d
diff --git a/4-GPU/HandsOn/Solution/FORTRAN/task3/poisson2d_serial.F03 b/4-GPU/HandsOn/Solution/FORTRAN/task3/poisson2d_serial.F03
new file mode 100644
index 0000000000000000000000000000000000000000..8a6e0a9f25deb2e6a615e3e1ba214f48d93a4ac1
--- /dev/null
+++ b/4-GPU/HandsOn/Solution/FORTRAN/task3/poisson2d_serial.F03
@@ -0,0 +1,126 @@
+! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+!
+! Redistribution and use in source and binary forms, with or without
+! modification, are permitted provided that the following conditions
+! are met:
+!  * Redistributions of source code must retain the above copyright
+!    notice, this list of conditions and the following disclaimer.
+!  * Redistributions in binary form must reproduce the above copyright
+!    notice, this list of conditions and the following disclaimer in the
+!    documentation and/or other materials provided with the distribution.
+!  * Neither the name of NVIDIA CORPORATION nor the names of its
+!    contributors may be used to endorse or promote products derived
+!    from this software without specific prior written permission.
+!
+! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+! EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+! PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+! CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+! EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+! PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+! PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+! OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SUBROUTINE poisson2d_serial(nx, ny, iter_max, mpi_rank,tol,a_ref, a_new,rhs)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: nx, ny, iter_max, mpi_rank
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(INOUT) :: a_ref, a_new
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: rhs
+    INTEGER :: ix,iy, iter
+    REAL :: error
+    
+    iter = 1
+    error = 1.0
+    !$acc data present(a_ref,rhs,a_new)
+    !$acc update device(a_ref,rhs)
+    DO WHILE ( error > tol .AND. iter <= iter_max )
+        error = 0.0
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_new(ix,iy) = -0.25 * (rhs(ix,iy) - ( a_ref(ix+1,iy) + a_ref(ix-1,iy) + a_ref(ix,iy-1) + a_ref(ix,iy+1) ))
+                error = MAX( error, ABS( a_new(ix,iy) - a_ref(ix,iy) ) )
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            DO ix = 2, nx-1
+                a_ref(ix,iy) = a_new(ix,iy)
+            END DO
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO ix = 2, nx-1
+            a_ref(ix,1) = a_ref(ix,ny-1)
+            a_ref(ix,ny) = a_ref(ix,2)
+        END DO
+        !$acc end parallel
+        
+        !$acc parallel loop
+        DO iy = 2, ny-1
+            a_ref(1,iy) = a_ref(nx-1,iy)
+            a_ref(nx,iy) = a_ref(2,iy)
+        END DO
+        !$acc end parallel
+        
+        IF ( mpi_rank == 0 .AND. ( iter == 1 .OR. MOD( iter, 100 ) == 0 ) ) THEN
+            WRITE(*,"('  ',I4,' ',F8.6)") iter, error
+        END IF
+        
+        iter = iter+1
+    END DO
+    !$acc update self(a_ref)
+    !$acc end data
+END SUBROUTINE poisson2d_serial
+
+LOGICAL FUNCTION check_results( mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny, tol, a, a_ref )
+    USE mpi
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_rank, ix_start, ix_end, iy_start, iy_end, nx, ny
+    REAL, INTENT(IN) :: tol
+    REAL, DIMENSION(nx,ny), INTENT(IN) :: a, a_ref
+    INTEGER :: ix,iy,ierror
+    LOGICAL :: no_errors, global_no_errors, mpi_is_initialized
+    no_errors = .TRUE.
+    iy = iy_start
+    DO WHILE ( iy <= iy_end .AND. no_errors )
+        ix = ix_start
+        DO WHILE ( ix <= ix_end .AND. no_errors )
+            IF ( ABS( a_ref(ix,iy) - a(ix,iy)) >= tol ) THEN
+                WRITE(*,"('[MPI',I1,'] ERROR: a(',I4,',',I4,') = ',F8.6,' does not match ',F8.6,' (reference)')") &
+                    mpi_rank,ix,iy,a(ix,iy),a_ref(ix,iy)
+                no_errors = .FALSE.
+            END IF
+            ix = ix + 1
+        END DO
+        iy = iy + 1
+    END DO
+    
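+    ! When running under MPI, combine the per-rank results so every rank returns the same verdict.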
+    CALL MPI_Initialized(mpi_is_initialized, ierror)
+    IF ( mpi_is_initialized ) THEN
+        global_no_errors = .FALSE.
+        CALL MPI_ALLREDUCE(no_errors, global_no_errors, 1, MPI_LOGICAL, MPI_LAND, MPI_COMM_WORLD, ierror)
+        no_errors = global_no_errors
+    END IF
+    check_results = no_errors
+END FUNCTION check_results
+
+SUBROUTINE size_to_2Dsize(mpi_size, mpi_sizex, mpi_sizey)
+    IMPLICIT NONE
+    INTEGER, INTENT(IN) :: mpi_size
+    INTEGER, INTENT(OUT) :: mpi_sizex, mpi_sizey
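+    ! Precomputed 2D decompositions for 1 to 16 ranks; column n holds (mpi_sizey, mpi_sizex) for n ranks.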
+    INTEGER, DIMENSION(2,16), PARAMETER :: size_to_size2d_map = reshape( &
+        (/ 1,1 , 2,1 , 3,1  , 2,2 , &
+           5,1 , 3,2 , 7,1  , 4,2 , &
+           3,3 , 5,2 , 11,1 , 6,2 , &
+          13,1 , 7,2 , 5,3  , 4,4 /), (/ 2, 16 /) )
+    mpi_sizex = size_to_size2d_map(2,mpi_size)
+    mpi_sizey = size_to_size2d_map(1,mpi_size)
+END SUBROUTINE size_to_2Dsize
diff --git a/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.html b/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.html
index cb32d34bfe7140d3782228eebf2e6c73abb53217..151345c8a58f2f77b345ab5cf987bd7d4b75d779 100644
--- a/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.html
+++ b/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.html
@@ -13017,45 +13017,6 @@ ul.typeahead-list  > li > a.pull-right {
 .highlight .vm { color: #19177C } /* Name.Variable.Magic */
 .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
     </style>
-<style type="text/css">
-    
-/* Temporary definitions which will become obsolete with Notebook release 5.0 */
-.ansi-black-fg { color: #3E424D; }
-.ansi-black-bg { background-color: #3E424D; }
-.ansi-black-intense-fg { color: #282C36; }
-.ansi-black-intense-bg { background-color: #282C36; }
-.ansi-red-fg { color: #E75C58; }
-.ansi-red-bg { background-color: #E75C58; }
-.ansi-red-intense-fg { color: #B22B31; }
-.ansi-red-intense-bg { background-color: #B22B31; }
-.ansi-green-fg { color: #00A250; }
-.ansi-green-bg { background-color: #00A250; }
-.ansi-green-intense-fg { color: #007427; }
-.ansi-green-intense-bg { background-color: #007427; }
-.ansi-yellow-fg { color: #DDB62B; }
-.ansi-yellow-bg { background-color: #DDB62B; }
-.ansi-yellow-intense-fg { color: #B27D12; }
-.ansi-yellow-intense-bg { background-color: #B27D12; }
-.ansi-blue-fg { color: #208FFB; }
-.ansi-blue-bg { background-color: #208FFB; }
-.ansi-blue-intense-fg { color: #0065CA; }
-.ansi-blue-intense-bg { background-color: #0065CA; }
-.ansi-magenta-fg { color: #D160C4; }
-.ansi-magenta-bg { background-color: #D160C4; }
-.ansi-magenta-intense-fg { color: #A03196; }
-.ansi-magenta-intense-bg { background-color: #A03196; }
-.ansi-cyan-fg { color: #60C6C8; }
-.ansi-cyan-bg { background-color: #60C6C8; }
-.ansi-cyan-intense-fg { color: #258F8F; }
-.ansi-cyan-intense-bg { background-color: #258F8F; }
-.ansi-white-fg { color: #C5C1B4; }
-.ansi-white-bg { background-color: #C5C1B4; }
-.ansi-white-intense-fg { color: #A1A6B2; }
-.ansi-white-intense-bg { background-color: #A1A6B2; }
-
-.ansi-bold { font-weight: bold; }
-
-    </style>
 
 
 <style type="text/css">
@@ -13089,7 +13050,7 @@ div#notebook {
 
 <!-- Loading mathjax macro -->
 <!-- Load mathjax -->
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS_HTML"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS_HTML"></script>
     <!-- MathJax configuration -->
     <script type="text/x-mathjax-config">
     MathJax.Hub.Config({
@@ -13116,7 +13077,7 @@ div#notebook {
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Solutions:-Hands-On-GPU-Programming">Solutions: Hands-On GPU Programming<a class="anchor-link" href="#Solutions:-Hands-On-GPU-Programming">&#182;</a></h1><p><em>Supercomputing 2018 Tutorial "Application Porting and Optimization on GPU-Accelerated POWER Architectures", November 12th 2018</em></p>
+<h1 id="Hands-On-GPU-Programming">Hands-On GPU Programming<a class="anchor-link" href="#Hands-On-GPU-Programming">&#182;</a></h1><p><em>Supercomputing 2019 Tutorial "Application Porting and Optimization on GPU-Accelerated POWER Architectures", November 18th 2019</em></p>
 <hr>
 
 </div>
@@ -13125,29 +13086,32 @@ div#notebook {
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h3 id="Read-me-first">Read me first<a class="anchor-link" href="#Read-me-first">&#182;</a></h3><p><strong>This contains the output for the solutions. It is for illustrative purpose only and not suitable for execution.</strong>   The solutions are described in the solution section. If you run this as a jupyter notebook from the <code>Solutions</code> directory links to the solution source files and solution profiles should work. For the <em>html</em> and <em>pdf</em> versions please navigate to the corresponding directory to find the solution profiles and sources.</p>
-<p>Skip ahead to the <a href="#solutions">Solutions</a></p>
-<hr>
-<hr>
-<p>This tutorial is primarily designed to be executed as a <em>jupyter</em> notebook. However, everything can also be done using an <em>ssh</em> connection to <em>ascent.olcf.ornl.gov</em> in your terminal.</p>
-<h4 id="Jupyter-notebook-execution">Jupyter notebook execution<a class="anchor-link" href="#Jupyter-notebook-execution">&#182;</a></h4><p>When using jupyter this notebook will guide you through the step. Note that if you execute a cell multiple times while optimizing the code the output will be replaced. You can however duplicate the cell you want to execute and keep its output. Check the <em>edit</em> menu above.</p>
-<p>You will always find links to a file browser of the corresponding task subdirectory as well as direct links to the source files you will need to edit as well as the profiling output you need to open locally.</p>
-<p>If you want you also can get a <a href="/terminals/4">terminal</a> in your browser.</p>
-<h4 id="Terminal-fallback">Terminal fallback<a class="anchor-link" href="#Terminal-fallback">&#182;</a></h4><p>The tasks are placed in directories named <code>[C/FORTRAN]/task[0-3]</code>.</p>
-<p>The files you will need to edit are always the <code>poisson2d.(C|F03)</code> files.</p>
-<p>The makefile targets execute everything to compile, run and profile the code. Please take a look at the cells containing the make calls as a guide.</p>
-<p>The outputs of profiling runs be placed in the working directory of the current task and are named like <code>*.pgprof</code> or <code>pgprof.*.tar.gz</code> in case of multiple files. You can use <em>scp/sftp</em> to transfer files to your machine and for viewing them in pgprof/nvprof.</p>
-<h4 id="Viewing-profiles-in-the-NVIDIA-Visual-Profiler-/-PGI-Profiler">Viewing profiles in the NVIDIA Visual Profiler / PGI Profiler<a class="anchor-link" href="#Viewing-profiles-in-the-NVIDIA-Visual-Profiler-/-PGI-Profiler">&#182;</a></h4><p>The profiles generated <em>pgprof / nvprof</em> should be viewed on your local machine. You can install the PGI Community Edition (pgprof) or the NVIDIA CUDA Toolkit on your notebook (Windows, Mac, Linux). You don't need an NVIDIA GPU in your machine to use the profiler GUI.</p>
-<p>There are USB Sticks in the room that contain the installers for various platforms, but for reference you can also download it from:</p>
+<h2 id="Solutions">Solutions<a class="anchor-link" href="#Solutions">&#182;</a></h2><p><strong>This contains the output for the solutions.</strong></p>
+<p>The solutions are described in the solution section. The directory links to the solution source files should work though. For the <em>html</em> and <em>pdf</em> versions please navigate to the corresponding directory to find the solution profiles and sources.</p>
+<h3 id="GPU-Programming">GPU Programming<a class="anchor-link" href="#GPU-Programming">&#182;</a></h3><ul>
+<li><a href="#solution0">Solution 0</a> Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using <code>–ta=tesla:managed</code>  </li>
+</ul>
+<ul>
+<li><a href="#solution1">Solution 1</a> Fix memory access pattern of OpenACC accelerated Jacobi Solver  </li>
+</ul>
+<h3 id="Multi-GPU-with-MPI">Multi-GPU with MPI<a class="anchor-link" href="#Multi-GPU-with-MPI">&#182;</a></h3><ul>
+<li><a href="#solution2">Solution 2</a> Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs  </li>
+</ul>
 <ul>
-<li><a href="https://developer.nvidia.com/cuda-downloads">NVIDIA CUDA Toolkit</a></li>
-<li><a href="https://www.pgroup.com/products/community.htm">PGI Community Edition</a></li>
+<li><a href="#solution3">Solution 3</a> Hide MPI communication time by overlapping communication and 
+  computation in a MPI+OpenACC multi GPU Jacobi Solver  </li>
+</ul>
+<h3 id="Multi-GPU-with-NVSHMEM-(Advanced----C-only)">Multi-GPU with NVSHMEM <em>(Advanced -- C only)</em><a class="anchor-link" href="#Multi-GPU-with-NVSHMEM-(Advanced----C-only)">&#182;</a></h3><ul>
+<li><a href="#solution4">Solution 4</a> Use NVSHMEM instead of MPI  </li>
 </ul>
-<p>After downloading the profiler output (more infos below) follow the steps outlined in:</p>
 <ul>
-<li><a href="https://docs.nvidia.com/cuda/profiler-users-guide/index.html#import-session">Import Session</a></li>
+<li><a href="#solution5">Solution 5</a> Put NVSHMEM calls on stream to hide API calls and GPU/CPU synchronization  </li>
+</ul>
+<h3 id="Survey">Survey<a class="anchor-link" href="#Survey">&#182;</a></h3><ul>
+<li><a href="#survey">Suvery</a> Please remember to take the survey !</li>
 </ul>
-<p>In case there is confusion: The PGI Profiler is a slightly modified version (different default settings) of the NVIDIA Visual Profiler. So you can use any of the two to view profiles.</p>
+<hr>
+<hr>
 
 </div>
 </div>
@@ -13178,6 +13142,7 @@ div#notebook {
 <span class="k">if</span><span class="p">(</span><span class="ow">not</span> <span class="n">rootdir</span><span class="p">):</span>
     <span class="n">rootdir</span><span class="o">=%</span><span class="k">pwd</span>
 <span class="n">basedir</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">rootdir</span><span class="p">,</span><span class="n">LANGUAGE</span><span class="p">)</span>
+<span class="n">basedirC</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">rootdir</span><span class="p">,</span><span class="s1">&#39;C&#39;</span><span class="p">)</span>
 
 <span class="nb">print</span> <span class="p">(</span><span class="s2">&quot;You selected </span><span class="si">{}</span><span class="s2"> for the exercises.&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">LANGUAGE</span><span class="p">))</span>
 
@@ -13191,6 +13156,8 @@ div#notebook {
         <span class="n">d</span><span class="o">=</span><span class="s1">&#39;</span><span class="si">%s</span><span class="s1">/task</span><span class="si">%i</span><span class="s1">&#39;</span><span class="o">%</span><span class="p">(</span><span class="n">basedir</span><span class="p">,</span><span class="n">t</span><span class="p">)</span>
         <span class="o">%</span><span class="k">cd</span> $d
         <span class="o">!</span>make clean
+        
+<span class="c1">#cleanall()</span>
 </pre></div>
 
     </div>
@@ -13215,41 +13182,43 @@ div#notebook {
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[2]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task0
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task0
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Tasks">Tasks<a name="top" /><a class="anchor-link" href="#Tasks">&#182;</a></h1><p>This session comes with multiple tasks. All tasks are available in C or FORTRAN and can be found in the <code>[C|Fortan]/task[0-3]</code> subdirectories. There you will also find Makefiles that are set up so that you can compile and submit all necessary tasks.</p>
-<p>Please choose from the task below.</p>
-<ul>
-<li><p><a href="#task0">Task 0</a> Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using <code>–ta=tesla:managed</code><br>
-<a href="#solution0">Solution 0</a></p>
-</li>
-<li><p><a href="#task1">Task 1</a> Fix memory access pattern of OpenACC accelerated Jacobi Solver<br>
-<a href="#solution1">Solution 1</a></p>
-</li>
-<li><p><a href="#task2">Task 2</a> Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs<br>
-<a href="#solution2">Solution 2</a></p>
-</li>
-<li><p><a href="#task3">Task 3</a> Hide MPI communication time by overlapping communication and 
-  computation in a MPI+OpenACC multi GPU Jacobi Solver<br>
-<a href="#solution3">Solution 3</a></p>
-</li>
-</ul>
-<ul>
-<li><a href="#survey">Suvery</a> Please remember to take the survey !</li>
-</ul>
-<h3 id="Make-Targets-">Make Targets <a name="make" /><a class="anchor-link" href="#Make-Targets-">&#182;</a></h3><p>For all tasks we have defined the following make targets.</p>
-<ul>
-<li><strong>run</strong>:<br>
- run <code>poisson2d</code></li>
-<li><strong>poisson2d</strong>:<br>
-build <code>poisson2d</code> binary (default)</li>
-<li><strong>profile</strong>:<br>
-profile with <code>pgprof</code></li>
-<li><strong>*.solution</strong>:<br>
-same as above for the solution (e.g. <code>make poisson2d.solution</code> or <code>make run.solution</code>)</li>
-</ul>
+<hr>
+<hr>
 
 </div>
 </div>
@@ -13257,6 +13226,7 @@ same as above for the solution (e.g. <code>make poisson2d.solution</code> or <co
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
+<h1 id="Solutions">Solutions<a name="solutions" /><a class="anchor-link" href="#Solutions">&#182;</a></h1><p>Below are suggested solutions. This is only a short description of the solution, but the <code>poisson2d.solution.(c|F03)</code> files linked below have the full source code. If you want to run / profile the solutions feel free to duplicate the cells for the tasks and change the <a href="#make">make target</a> to the <code>*.solution</code> ones.</p>
 <p><a href="#top">Back to Top</a></p>
 <hr>
 
@@ -13266,32 +13236,37 @@ same as above for the solution (e.g. <code>make poisson2d.solution</code> or <co
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Task-0:-Using-OpenACC">Task 0: <a name="task0" />Using OpenACC<a class="anchor-link" href="#Task-0:-Using-OpenACC">&#182;</a></h2><h3 id="Description">Description<a class="anchor-link" href="#Description">&#182;</a></h3><p>The goal of this task is to accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using <code>–ta=tesla:managed</code>.</p>
-<p>Your task is to:</p>
-<ul>
-<li>Parallelize Loops with OpenACC parallel loop</li>
-</ul>
-<p><em>Look for</em> <strong>TODOs</strong> in the code.</p>
-<p>Look at the output generated by the PGI compiler (enabled by the <code>-Minfo=accel</code> option) to see how the compiler parallelizes the code.</p>
-<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><p>You can open the source code either in a terminal in an editor. Navigate to <code>(C|Fortran)/task0/</code> and open <code>poisson2d.c</code> in a editor of your choice.</p>
-<p>If your are using the jupyter approach by following the link (for the language of your choice), This will open the source code in an editor in a new browser tab/window.</p>
-<ul>
-<li><a href="/edit/C/task0/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTAN/task0/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task0/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task0/">Fortran Version</a></li>
+<h2 id="Solution-0:">Solution 0:<a name="solution0" /><a class="anchor-link" href="#Solution-0:">&#182;</a></h2><div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop</span>
+<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span><span class="p">)</span>
+<span class="p">{</span>
+    <span class="cp">#pragma acc loop</span>
+    <span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">iy</span> <span class="o">=</span> <span class="n">iy_start</span><span class="p">;</span> <span class="n">iy</span> <span class="o">&lt;</span> <span class="n">iy_end</span><span class="p">;</span> <span class="n">iy</span><span class="o">++</span> <span class="p">)</span>
+    <span class="p">{</span>
+        <span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mf">0.25</span> <span class="o">*</span> <span class="p">(</span><span class="n">rhs</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">-</span> <span class="p">(</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">+</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
+                                               <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">+</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="p">));</span>
+        <span class="n">error</span> <span class="o">=</span> <span class="n">fmaxr</span><span class="p">(</span> <span class="n">error</span><span class="p">,</span> <span class="n">fabsr</span><span class="p">(</span><span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span><span class="o">-</span><span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]));</span>
+    <span class="p">}</span>
+<span class="p">}</span>
+</pre></div>
+<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
+<li><a href="/C/task0/poisson2d.solution.c?edit=1">C Version</a></li>
+<li><a href="/edit/./FORTRAN/task0/poisson2d.solution.F03">Fortran Version</a></li>
 </ul>
-<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
+
+</div>
+</div>
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  <strong>After</strong> the profiling finished the output file <code>poisson2d.solution.pgprof</code>  can be downloaded from here: <a href="/tree/./C/task0/poisson2d.solution.pgprof?download=1">C Version</a> / <a href="./FORTRAN/task0/poisson2d.solution.pgprof?download=1">Fortran Version</a>.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[3]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task0
@@ -13301,77 +13276,177 @@ same as above for the solution (e.g. <code>make poisson2d.solution</code> or <co
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task0
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compilation-and-Execution">Compilation and Execution<a class="anchor-link" href="#Compilation-and-Execution">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
-Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[4]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task0&#39;</span><span class="p">)</span>
-<span class="o">!</span>make
+<span class="o">!</span>make poisson2d.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>pgcc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed poisson2d_serial.c -o poisson2d_serial.o
+pgcc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+poisson2d.solution.c:
+main:
+     66, Generating Tesla code
+         67, #pragma acc loop gang /* blockIdx.x */
+         68, #pragma acc loop vector(128) /* threadIdx.x */
+     66, Generating implicit copyout(A[:])
+     68, Loop is parallelizable
+     88, Generating Tesla code
+         89, #pragma acc loop gang /* blockIdx.x */
+         90, #pragma acc loop vector(128) /* threadIdx.x */
+         94, Generating implicit reduction(max:error)
+     88, Generating implicit copyin(A[:],rhs[:])
+         Generating implicit copyout(Anew[:])
+     90, Loop is parallelizable
+     98, Generating Tesla code
+         99, #pragma acc loop gang /* blockIdx.x */
+        100, #pragma acc loop vector(128) /* threadIdx.x */
+     98, Generating implicit copyin(Anew[:])
+         Generating implicit copyout(A[:])
+    100, Loop is parallelizable
+    106, Generating Tesla code
+        107, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    106, Generating implicit copyin(A[:])
+         Generating implicit copyout(A[nx*(ny-1)+1:2046])
+    111, Generating Tesla code
+        112, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    111, Generating implicit copy(A[:])
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[5]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task0&#39;</span><span class="p">)</span>
-<span class="o">!</span>make run
+<span class="o">!</span>make run.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution
+Job &lt;25189&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+Jacobi relaxation Calculation: 2048 x 2048 mesh
+Calculate reference solution and time serial CPU execution.
+    0, 0.249999
+  100, 0.249760
+  200, 0.249522
+  300, 0.249285
+  400, 0.249048
+GPU execution.
+    0, 0.249999
+  100, 0.249760
+  200, 0.249522
+  300, 0.249285
+  400, 0.249048
+2048x2048: 1 CPU:   5.4684 s, 1 GPU:   0.1884 s, speedup:    29.02
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output file <code>poisson2d.pgprof</code>  can be downloaded from here: <a href="/tree/C/task0/poisson2d.pgprof?download=1">C Version</a> / <a href="/tree/FORTRAN/task0/poisson2d.pgprof?download=1">Fortran Version</a>.
-Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[6]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task0&#39;</span><span class="p">)</span>
-<span class="o">!</span>make profile
+<span class="o">!</span>make profile.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.pgprof ./poisson2d.solution 10
+Job &lt;25190&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+==91820== PGPROF is profiling process 91820, command: ./poisson2d.solution 10
+==91820== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.pgprof
+Jacobi relaxation Calculation: 2048 x 2048 mesh
+Calculate reference solution and time serial CPU execution.
+    0, 0.249999
+GPU execution.
+    0, 0.249999
+2048x2048: 1 CPU:   0.1230 s, 1 GPU:   0.0189 s, speedup:     6.51
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.pgprof .
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
-<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
-<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
-</ol>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
@@ -13385,233 +13460,450 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Task-1:-Memory-Access-Patterns">Task 1:<a name="task1" /> Memory Access Patterns<a class="anchor-link" href="#Task-1:-Memory-Access-Patterns">&#182;</a></h2><h3 id="Description">Description<a class="anchor-link" href="#Description">&#182;</a></h3><p>The goal of this task is to fix the memory access pattern of OpenACC accelerated Jacobi Solver. Generate the profile, download the generated profiles and import them into pgprof / nvprof.
-There use “Global Memory Access Pattern” experiment to analyze the issue.</p>
-<p><em>Look for</em> <strong>TODOs</strong> in the code.</p>
+<h2 id="Solution-1:">Solution 1:<a name="solution1" /><a class="anchor-link" href="#Solution-1:">&#182;</a></h2><p>Swap the <code>ix</code> and <code>iy</code> loops to make sure that <code>ix</code> is the fastest running index</p>
+<div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop</span>
+<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">iy</span> <span class="o">=</span> <span class="n">iy_start</span><span class="p">;</span> <span class="n">iy</span> <span class="o">&lt;</span> <span class="n">iy_end</span><span class="p">;</span> <span class="n">iy</span><span class="o">++</span><span class="p">)</span>
+<span class="p">{</span>
+    <span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span> <span class="p">)</span>
+    <span class="p">{</span>
+        <span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mf">0.25</span> <span class="o">*</span> <span class="p">(</span><span class="n">rhs</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">-</span> <span class="p">(</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">+</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
+                                               <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">+</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="p">));</span>
+        <span class="n">error</span> <span class="o">=</span> <span class="n">fmaxr</span><span class="p">(</span> <span class="n">error</span><span class="p">,</span> <span class="n">fabsr</span><span class="p">(</span><span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span><span class="o">-</span><span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]));</span>
+    <span class="p">}</span>
+<span class="p">}</span>
+</pre></div>
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task1/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task1/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="Directory-browser">Directory browser<a class="anchor-link" href="#Directory-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task1/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task1/">Fortran Version</a></li>
+<li><a href="/edit/C/task1/poisson2d.solution.c">C Version</a></li>
+<li><a href="/edit/FORTRAN/task1/poisson2d.solution.F03">Fortran Version</a></li>
 </ul>
-<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
-
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task1
-</pre></div>
 
-    </div>
 </div>
 </div>
-
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compilation-and-Execution">Compilation and Execution<a class="anchor-link" href="#Compilation-and-Execution">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
-Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  <strong>After</strong> the profiling finished the output file <code>poisson2d.solution.pgprof</code>  can be downloaded from here: <a href="/tree/C/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1">Fortran Version</a>.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[7]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
-<span class="o">!</span>make
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task1
 </pre></div>
 
     </div>
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
-<span class="o">!</span>make run
-</pre></div>
+<div class="output_wrapper">
+<div class="output">
 
-    </div>
-</div>
-</div>
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output files can be downloaded from here: <a href="/tree/C/task1/pgprof.poisson2d.Task1.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task1/pgprof.Task1.poisson2d.tar.gz?download=1">Fortran Version</a>.
-Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu.</p>
+<div class="output_area">
 
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task1
+</pre>
 </div>
 </div>
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">!</span>ls
-</pre></div>
 
-    </div>
 </div>
 </div>
 
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[8]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
-<span class="o">!</span>make profile
+<span class="o">!</span>make poisson2d.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>pgcc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo poisson2d_serial.c -o poisson2d_serial.o
+pgcc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+poisson2d.solution.c:
+main:
+     66, Generating Tesla code
+         67, #pragma acc loop gang /* blockIdx.x */
+         68, #pragma acc loop vector(128) /* threadIdx.x */
+     66, Generating implicit copyout(A[:])
+     68, Loop is parallelizable
+     88, Generating Tesla code
+         89, #pragma acc loop gang /* blockIdx.x */
+         90, #pragma acc loop vector(128) /* threadIdx.x */
+         94, Generating implicit reduction(max:error)
+     88, Generating implicit copyin(A[:],rhs[:])
+         Generating implicit copyout(Anew[:])
+     90, Loop is parallelizable
+     98, Generating Tesla code
+         99, #pragma acc loop gang /* blockIdx.x */
+        100, #pragma acc loop vector(128) /* threadIdx.x */
+     98, Generating implicit copyin(Anew[:])
+         Generating implicit copyout(A[:])
+    100, Loop is parallelizable
+    106, Generating Tesla code
+        107, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    106, Generating implicit copyin(A[:])
+         Generating implicit copyout(A[nx*(ny-1)+1:2046])
+    111, Generating Tesla code
+        112, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    111, Generating implicit copy(A[:])
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<p>For the <em>Global Memory Load/Store Efficiency</em> the <code>make profile</code> command also generated a CSV file that you can import and view with the cell below.<br>
-If you purely work in a terminal you can view the same output by running <code>pgprof -i poisson2d.efficiency.pgprof</code>.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[9]:</div>
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
-<span class="n">data_frame</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;poisson2d.efficiency.csv&#39;</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">&#39;,&#39;</span><span class="p">)</span>
-<span class="n">data_frame</span>
+<span class="o">!</span>make run.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
-<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
-<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
-<li><a href="https://www.pgroup.com/resources/pgprof-quickstart.htm">pgprof Quickstart</a></li>
-<li><a href="https://docs.nvidia.com/cuda/profiler-users-guide/index.html">CUDA Toolkit Documentation - Profiler</a> <em>pgprof is based on the NVIDIA Visual Profiler</em></li>
-</ol>
+<div class="output_wrapper">
+<div class="output">
 
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<p><a href="#top">Back to Top</a></p>
-<hr>
 
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution
+Job &lt;25191&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+Jacobi relaxation Calculation: 2048 x 2048 mesh
+Calculate reference solution and time serial CPU execution.
+    0, 0.249999
+  100, 0.249760
+  200, 0.249522
+  300, 0.249285
+  400, 0.249048
+GPU execution.
+    0, 0.249999
+  100, 0.249760
+  200, 0.249522
+  300, 0.249285
+  400, 0.249048
+2048x2048: 1 CPU:   5.4691 s, 1 GPU:   0.1866 s, speedup:    29.31
+</pre>
 </div>
 </div>
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Task-2:-Apply-Domain-Decomposition">Task 2: <a name="task2" />Apply Domain Decomposition<a class="anchor-link" href="#Task-2:-Apply-Domain-Decomposition">&#182;</a></h2><h3 id="Description">Description<a class="anchor-link" href="#Description">&#182;</a></h3><p>Your task is to apply a domain decomposition and use MPI for the data exchange. Specifically you should</p>
-<ul>
-<li>Handle GPU affinity</li>
-<li>Do the Halo Exchange</li>
-</ul>
-<p><em>Look for</em> <strong>TODOs</strong></p>
-<p>When profiling take a look at how kernel and communication times change when you scale to more GPUs.</p>
-<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task2/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task2/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task2/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task2/">Fortran Version</a></li>
-</ul>
-<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[10]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task2
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
+<span class="o">!</span>make profile.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.timeline.pgprof ./poisson2d.solution 3
+Job &lt;25192&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+==92054== PGPROF is profiling process 92054, command: ./poisson2d.solution 3
+==92054== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.timeline.pgprof
+Jacobi relaxation Calculation: 2048 x 2048 mesh
+Calculate reference solution and time serial CPU execution.
+    0, 0.249999
+GPU execution.
+    0, 0.249999
+2048x2048: 1 CPU:   0.0465 s, 1 GPU:   0.0154 s, speedup:     3.01
+bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  --analysis-metrics -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.metrics.pgprof ./poisson2d.solution 3
+Job &lt;25193&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+==71647== PGPROF is profiling process 71647, command: ./poisson2d.solution 3
+==71647== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
+==71647== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.metrics.pgprof
+Jacobi relaxation Calculation: 2048 x 2048 mesh
+Calculate reference solution and time serial CPU execution.
+    0, 0.249999
+GPU execution.
+    0, 0.249999
+2048x2048: 1 CPU:   0.0476 s, 1 GPU:  12.4561 s, speedup:     0.00
+bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  --metrics gld_efficiency,gst_efficiency -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3
+Job &lt;25194&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+==92292== PGPROF is profiling process 92292, command: ./poisson2d.solution 3
+==92292== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
+==92292== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.efficiency.pgprof
+Jacobi relaxation Calculation: 2048 x 2048 mesh
+Calculate reference solution and time serial CPU execution.
+    0, 0.249999
+GPU execution.
+    0, 0.249999
+2048x2048: 1 CPU:   0.0487 s, 1 GPU:   0.6897 s, speedup:     0.07
+pgprof --csv -i /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.efficiency.pgprof 2&gt;&amp;1 | grep -v &#34;======&#34; &gt; poisson2d.solution.efficiency.csv
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.*.pgprof .
+tar -cvzf pgprof.poisson2d.Task1.solution.tar.gz  poisson2d.solution.*.pgprof
+poisson2d.solution.efficiency.pgprof
+poisson2d.solution.metrics.pgprof
+poisson2d.solution.timeline.pgprof
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compilation">Compilation<a class="anchor-link" href="#Compilation">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
-Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
+<p>For the <em>Global Memory Load/Store Efficiency</em> the <code>make profile.solution</code> command also generated a CSV file that you can import and view with the cell below.<br>
+If you work purely in a terminal you can view the same output by running <code>pgprof -i poisson2d.solution.efficiency.pgprof</code>.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[11]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
-<span class="o">!</span>make poisson2d
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">data_frame_solution</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;poisson2d.solution.efficiency.csv&#39;</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">&#39;,&#39;</span><span class="p">)</span>
+<span class="n">data_frame_solution</span>
 </pre></div>
 
     </div>
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Running">Running<a class="anchor-link" href="#Running">&#182;</a></h4><p>For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable <code>NP</code>. On <em>Ascent</em> within a single node you can use up to 6 GPUs.</p>
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt output_prompt">Out[11]:</div>
+
+
+
+<div class="output_html rendered_html output_subarea output_execute_result">
+<div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
 
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>Device</th>
+      <th>Kernel</th>
+      <th>Invocations</th>
+      <th>Metric Name</th>
+      <th>Metric Description</th>
+      <th>Min</th>
+      <th>Max</th>
+      <th>Avg</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_98_gpu</td>
+      <td>3</td>
+      <td>gld_efficiency</td>
+      <td>Global Memory Load Efficiency</td>
+      <td>90.868353%</td>
+      <td>90.896134%</td>
+      <td>90.881874%</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_98_gpu</td>
+      <td>3</td>
+      <td>gst_efficiency</td>
+      <td>Global Memory Store Efficiency</td>
+      <td>88.956522%</td>
+      <td>88.956522%</td>
+      <td>88.956522%</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_106_gpu</td>
+      <td>3</td>
+      <td>gld_efficiency</td>
+      <td>Global Memory Load Efficiency</td>
+      <td>94.722222%</td>
+      <td>94.722222%</td>
+      <td>94.722222%</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_106_gpu</td>
+      <td>3</td>
+      <td>gst_efficiency</td>
+      <td>Global Memory Store Efficiency</td>
+      <td>88.956522%</td>
+      <td>88.956522%</td>
+      <td>88.956522%</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_94_gpu__red</td>
+      <td>3</td>
+      <td>gld_efficiency</td>
+      <td>Global Memory Load Efficiency</td>
+      <td>99.756335%</td>
+      <td>99.756335%</td>
+      <td>99.756335%</td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_94_gpu__red</td>
+      <td>3</td>
+      <td>gst_efficiency</td>
+      <td>Global Memory Store Efficiency</td>
+      <td>25.000000%</td>
+      <td>25.000000%</td>
+      <td>25.000000%</td>
+    </tr>
+    <tr>
+      <th>6</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_66_gpu</td>
+      <td>1</td>
+      <td>gld_efficiency</td>
+      <td>Global Memory Load Efficiency</td>
+      <td>0.000000%</td>
+      <td>0.000000%</td>
+      <td>0.000000%</td>
+    </tr>
+    <tr>
+      <th>7</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_66_gpu</td>
+      <td>1</td>
+      <td>gst_efficiency</td>
+      <td>Global Memory Store Efficiency</td>
+      <td>100.000000%</td>
+      <td>100.000000%</td>
+      <td>100.000000%</td>
+    </tr>
+    <tr>
+      <th>8</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_88_gpu</td>
+      <td>3</td>
+      <td>gld_efficiency</td>
+      <td>Global Memory Load Efficiency</td>
+      <td>91.834032%</td>
+      <td>91.855433%</td>
+      <td>91.843628%</td>
+    </tr>
+    <tr>
+      <th>9</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_88_gpu</td>
+      <td>3</td>
+      <td>gst_efficiency</td>
+      <td>Global Memory Store Efficiency</td>
+      <td>88.845486%</td>
+      <td>88.845486%</td>
+      <td>88.845486%</td>
+    </tr>
+    <tr>
+      <th>10</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_111_gpu</td>
+      <td>3</td>
+      <td>gld_efficiency</td>
+      <td>Global Memory Load Efficiency</td>
+      <td>25.000000%</td>
+      <td>25.000000%</td>
+      <td>25.000000%</td>
+    </tr>
+    <tr>
+      <th>11</th>
+      <td>Tesla V100-SXM2-16GB (0)</td>
+      <td>main_111_gpu</td>
+      <td>3</td>
+      <td>gst_efficiency</td>
+      <td>Global Memory Store Efficiency</td>
+      <td>25.000000%</td>
+      <td>25.000000%</td>
+      <td>25.000000%</td>
+    </tr>
+  </tbody>
+</table>
 </div>
 </div>
+
 </div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run
-</pre></div>
 
-    </div>
 </div>
 </div>
 
@@ -13619,38 +13911,70 @@ Alternatively you can just navigate to the right directory and execute <code>mak
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Scaling">Scaling<a class="anchor-link" href="#Scaling">&#182;</a></h4><p>You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell.</p>
+<p><a href="#top">Back to Top</a></p>
+<hr>
 
 </div>
 </div>
 </div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run <span class="p">|</span> grep speedup &gt; scale.out
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run <span class="p">|</span> grep speedup &gt;&gt; scale.out
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
-<span class="n">data_frame2</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h2 id="Solution-2:">Solution 2:<a name="solution2" /><a class="anchor-link" href="#Solution-2:">&#182;</a></h2><p>Set the GPU used by the rank using <code>#pragma acc set device_num</code></p>
+<div class="highlight"><pre><span></span><span class="c1">//Initialize MPI and determine rank and size</span>
+<span class="n">MPI_Init</span><span class="p">(</span><span class="o">&amp;</span><span class="n">argc</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">argv</span><span class="p">);</span>
+<span class="n">MPI_Comm_rank</span><span class="p">(</span><span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">rank</span><span class="p">);</span>
+<span class="n">MPI_Comm_size</span><span class="p">(</span><span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">size</span><span class="p">);</span>
 
-<span class="o">!</span>rm scale.out
+<span class="cp">#pragma acc set device_num( rank )</span>
 
-<span class="n">data_frame2b</span><span class="o">=</span><span class="n">data_frame2</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
-<span class="n">data_frame2b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">A</span>    <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
+<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">Aref</span> <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
+<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">Anew</span> <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
+<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">rhs</span>  <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
 </pre></div>
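+<p>If more MPI ranks than GPUs can end up on a node, a common variant (a minimal sketch under that assumption, not taken from the tutorial sources) is to query the device count with the OpenACC runtime API and wrap the rank around it:</p>
+<div class="highlight"><pre>// Hypothetical sketch: map MPI ranks to GPUs via the OpenACC runtime API
+// so that oversubscribed ranks share devices (assumes openacc.h and MPI).
+#include &lt;mpi.h&gt;
+#include &lt;openacc.h&gt;
+
+int main(int argc, char** argv) {
+    int rank = 0;
+    MPI_Init(&amp;argc, &amp;argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &amp;rank);
+
+    // Query how many NVIDIA devices the runtime sees on this node and
+    // select one based on the rank, wrapping around if needed.
+    int num_devices = acc_get_num_devices(acc_device_nvidia);
+    if (num_devices &gt; 0) acc_set_device_num(rank % num_devices, acc_device_nvidia);
+
+    MPI_Finalize();
+    return 0;
+}
+</pre></div>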
+<p>Apply domain decomposition</p>
+<div class="highlight"><pre><span></span><span class="c1">// Ensure correctness if ny%size != 0</span>
+<span class="kt">int</span> <span class="n">chunk_size</span> <span class="o">=</span> <span class="n">ceil</span><span class="p">(</span> <span class="p">(</span><span class="mf">1.0</span><span class="o">*</span><span class="n">ny</span><span class="p">)</span><span class="o">/</span><span class="n">size</span> <span class="p">);</span>
+
+<span class="kt">int</span> <span class="n">iy_start</span> <span class="o">=</span> <span class="n">rank</span> <span class="o">*</span> <span class="n">chunk_size</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">iy_end</span>   <span class="o">=</span> <span class="n">iy_start</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">;</span>
+
+<span class="c1">// Do not process boundaries</span>
+<span class="n">iy_start</span> <span class="o">=</span> <span class="n">max</span><span class="p">(</span> <span class="n">iy_start</span><span class="p">,</span> <span class="mi">1</span> <span class="p">);</span>
+<span class="n">iy_end</span> <span class="o">=</span> <span class="n">min</span><span class="p">(</span> <span class="n">iy_end</span><span class="p">,</span> <span class="n">ny</span> <span class="o">-</span> <span class="mi">1</span> <span class="p">);</span>
+</pre></div>
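+<p>As a worked check of the arithmetic (a hypothetical stand-alone snippet, using the 4096 x 4096 mesh and 6 ranks from the runs in this notebook): <code>chunk_size = ceil(4096/6) = 683</code>, so after the clamps rank 0 owns rows 1 to 682 and rank 5 owns rows 3415 to 4094.</p>
+<div class="highlight"><pre>// Hypothetical stand-alone check of the decomposition bounds
+// (assumes ny = 4096 and size = 6 as in the runs above; link with -lm).
+#include &lt;math.h&gt;
+#include &lt;stdio.h&gt;
+
+static int max_i(int a, int b) { return a &gt; b ? a : b; }
+static int min_i(int a, int b) { return a &lt; b ? a : b; }
+
+int main(void) {
+    const int ny = 4096, size = 6;
+    int chunk_size = (int)ceil((1.0 * ny) / size);
+    for (int rank = 0; rank &lt; size; rank++) {
+        int iy_start = max_i(rank * chunk_size, 1);
+        int iy_end   = min_i(rank * chunk_size + chunk_size, ny - 1);
+        // Each rank updates rows [iy_start, iy_end); the global boundary
+        // rows 0 and ny-1 are never touched.
+        printf("rank %d: rows %d..%d\n", rank, iy_start, iy_end - 1);
+    }
+    return 0;
+}
+</pre></div>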
+<p>Exchange data</p>
+<div class="highlight"><pre><span></span><span class="c1">//Periodic boundary conditions</span>
+<span class="kt">int</span> <span class="n">top</span>    <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="o">?</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">:</span> <span class="n">rank</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">bottom</span> <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">))</span> <span class="o">?</span> <span class="mi">0</span> <span class="o">:</span> <span class="n">rank</span><span class="o">+</span><span class="mi">1</span><span class="p">;</span>
+<span class="cp">#pragma acc host_data use_device( A )</span>
+<span class="p">{</span>
+    <span class="kt">double</span> <span class="n">start_mpi</span> <span class="o">=</span> <span class="n">MPI_Wtime</span><span class="p">();</span>
+    <span class="c1">//1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom</span>
+    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="n">iy_start</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">A</span><span class="o">+</span><span class="n">iy_end</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
+
+    <span class="c1">//2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top</span>
+    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_start</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
+    <span class="n">mpi_time</span> <span class="o">+=</span> <span class="n">MPI_Wtime</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_mpi</span><span class="p">;</span>
+<span class="p">}</span>
+</pre></div>
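+<p>The <code>host_data use_device</code> region hands the <em>device</em> address of <code>A</code> to the CUDA-aware MPI library, so the halo rows can move directly from GPU memory without a host staging copy. A minimal stand-alone illustration of that pattern (a sketch assuming an OpenACC compiler and a CUDA-aware MPI, not part of the tutorial code):</p>
+<div class="highlight"><pre>// Hypothetical example: pass a device pointer to CUDA-aware MPI
+// via host_data use_device.
+#include &lt;mpi.h&gt;
+#include &lt;stdlib.h&gt;
+
+int main(int argc, char** argv) {
+    int rank = 0, size = 1;
+    MPI_Init(&amp;argc, &amp;argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &amp;rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &amp;size);
+
+    const int n = 1024;
+    double* buf = (double*)malloc(n * sizeof(double));
+    for (int i = 0; i &lt; n; i++) buf[i] = rank;
+
+    int right = (rank + 1) % size;
+    int left  = (rank + size - 1) % size;
+
+    #pragma acc data copy(buf[0:n])
+    {
+        // Inside host_data, "buf" resolves to the device copy, so MPI
+        // reads and writes GPU memory directly.
+        #pragma acc host_data use_device(buf)
+        {
+            MPI_Sendrecv_replace(buf, n, MPI_DOUBLE, right, 0, left, 0,
+                                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        }
+    }
+
+    free(buf);
+    MPI_Finalize();
+    return 0;
+}
+</pre></div>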
+<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
+<li><a href="/edit/C/task2/poisson2d.solution.c">C Version</a></li>
+<li><a href="/edit/FORTRAN/task2/poisson2d.solution.F03">Fortran Version</a></li>
+</ul>
 
-    </div>
 </div>
 </div>
-
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output files can be downloaded from here: <a href="/tree/C/task2/pgprof.poisson2d.Task2.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task2/pgprof.poisson2d.Task2.tar.gz?download=1">Fortran Version</a>.
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task2.solution.poisson2d.tar.gz</code>) with the File Browser. 
 Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
 
 </div>
@@ -13658,118 +13982,218 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[12]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task2
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task2
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
-<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
-<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
-<li><a href="https://www.open-mpi.org/doc/v3.1/">https://www.open-mpi.org/doc/v3.1/</a></li>
-</ol>
 
 </div>
 </div>
+
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<p><a href="#top">Back to Top</a></p>
-<hr>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[13]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<span class="o">!</span>make poisson2d.solution
+</pre></div>
 
+    </div>
 </div>
 </div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>mpicc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o
+poisson2d_serial:
+     36, Generating present(Anew[:],rhs[:],Aref[:])
+     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])
+     42, Generating Tesla code
+         43, #pragma acc loop gang /* blockIdx.x */
+         44, #pragma acc loop vector(128) /* threadIdx.x */
+         49, Generating implicit reduction(max:error)
+     44, Loop is parallelizable
+     53, Generating Tesla code
+         54, #pragma acc loop gang /* blockIdx.x */
+         55, #pragma acc loop vector(128) /* threadIdx.x */
+     55, Loop is parallelizable
+     61, Generating Tesla code
+         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     66, Generating Tesla code
+         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     78, Generating update self(Aref[:ny*nx])
+mpicc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+poisson2d.solution.c:
+main:
+     71, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])
+     87, Generating present(Aref[:],A[:])
+         Generating Tesla code
+         88, #pragma acc loop gang /* blockIdx.x */
+         89, #pragma acc loop vector(128) /* threadIdx.x */
+     89, Loop is parallelizable
+    140, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])
+    143, Generating present(A[:],rhs[:],Anew[:])
+         Generating Tesla code
+        144, #pragma acc loop gang /* blockIdx.x */
+        145, #pragma acc loop vector(128) /* threadIdx.x */
+        149, Generating implicit reduction(max:error)
+    145, Loop is parallelizable
+    157, Generating present(Anew[:],A[:])
+         Generating Tesla code
+        158, #pragma acc loop gang /* blockIdx.x */
+        159, #pragma acc loop vector(128) /* threadIdx.x */
+    159, Loop is parallelizable
+    184, Generating present(A[:])
+         Generating Tesla code
+        185, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    195, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])
+    213, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Task-3:-Hide-MPI-Communication-time">Task 3: <a name="task3" />Hide MPI Communication time<a class="anchor-link" href="#Task-3:-Hide-MPI-Communication-time">&#182;</a></h2><p>To overlap compute and communication you will need to</p>
-<ul>
-<li>start the copy loop asynchronously</li>
-<li>wait for async copy loop after MPI communication has finished</li>
-</ul>
-<p><em>Look for</em> <strong>TODOs</strong>.</p>
-<p>Compare the scaling and efficiency with the results from the previous task. Check for the overlap in the profiler.</p>
-<p><em>Optional</em>: Try to understand how well communication and compute overlap is able to improve efficiency when scaling to more GPUs.</p>
-<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task3/poisson2d.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task3/poisson2d.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task3/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task3/">Fortran Version</a></li>
-</ul>
-<p><strong>Before</strong> executing any of the cells below first execute the next cell to change to the right directory.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[14]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task3
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
+Job &lt;25195&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+Jacobi relaxation Calculation: 4096 x 4096 mesh
+Calculate reference solution and time serial execution.
+    0, 0.250000
+  100, 0.249940
+  200, 0.249880
+  300, 0.249821
+  400, 0.249761
+  500, 0.249702
+  600, 0.249642
+  700, 0.249583
+  800, 0.249524
+  900, 0.249464
+Parallel execution.
+    0, 0.250000
+  100, 0.249940
+  200, 0.249880
+  300, 0.249821
+  400, 0.249761
+  500, 0.249702
+  600, 0.249642
+  700, 0.249583
+  800, 0.249524
+  900, 0.249464
+Num GPUs: 2.
+4096x4096: 1 GPU:   1.3165 s, 2 GPUs:   0.7221 s, speedup:     1.82, efficiency:    91.17%
+MPI time:   0.0422 s, inter GPU BW:     2.89 GiB/s
+</pre>
+</div>
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compilation">Compilation<a class="anchor-link" href="#Compilation">&#182;</a></h4><p>If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call <code>make</code> with the desired <a href="#make">target</a>.
-Alternatively you can just navigate to the right directory and execute <code>make &lt;target&gt;</code> in your terminal.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[15]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
-<span class="o">!</span>make poisson2d
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile.solution
 </pre></div>
 
     </div>
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Running">Running<a class="anchor-link" href="#Running">&#182;</a></h4><p>For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable <code>NP</code>. On <em>Ascent</em> within a single node you can use up to 6 GPUs.</p>
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
 
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+Job &lt;25196&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+==92521== PGPROF is profiling process 92521, command: ./poisson2d.solution 10
+==92520== PGPROF is profiling process 92520, command: ./poisson2d.solution 10
+==92520== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.1.pgprof
+Jacobi relaxation Calculation: 4096 x 4096 mesh
+Calculate reference solution and time serial execution.
+    0, 0.250000
+Parallel execution.
+    0, 0.250000
+Num GPUs: 2.
+4096x4096: 1 GPU:   0.0224 s, 2 GPUs:   0.0130 s, speedup:     1.73, efficiency:    86.37%
+MPI time:   0.0007 s, inter GPU BW:     1.75 GiB/s
+==92521== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.0.pgprof
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.?.pgprof  .
+tar -cvzf pgprof.poisson2d.Task2.solution.tar.gz poisson2d.solution.Task2.NP2.?.pgprof
+poisson2d.solution.Task2.NP2.0.pgprof
+poisson2d.solution.Task2.NP2.1.pgprof
+</pre>
 </div>
 </div>
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run
-</pre></div>
 
-    </div>
 </div>
 </div>
 
@@ -13784,78 +14208,118 @@ Alternatively you can just navigate to the right directory and execute <code>mak
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
+<div class="prompt input_prompt">In&nbsp;[16]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run <span class="p">|</span> grep speedup &gt; scale.out
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run <span class="p">|</span> grep speedup &gt;&gt; scale.out
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run <span class="p">|</span> grep speedup &gt;&gt;  scale.out
-<span class="n">data_frame3</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run.solution <span class="p">|</span> grep speedup &gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="n">data_frameS2</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
 
 <span class="o">!</span>rm scale.out
 
-<span class="n">data_frame3b</span><span class="o">=</span><span class="n">data_frame3</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
-<span class="n">data_frame3b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+<span class="n">data_frameS2b</span><span class="o">=</span><span class="n">data_frameS2</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frameS2b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
 </pre></div>
 
     </div>
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Profiling">Profiling<a class="anchor-link" href="#Profiling">&#182;</a></h4><p>You can profile the code by executing the next cell. <strong>After</strong> the profiling finished the output files can be downloaded from here: <a href="/tree/C/task3/pgprof.poisson2d.Task3.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task3/pgprof.poisson2d.Task3.tar.gz?download=1">Fortran Version</a>.
-Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
+<div class="output_wrapper">
+<div class="output">
 
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
-<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile
-</pre></div>
 
-    </div>
-</div>
-</div>
+<div class="output_area">
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="References">References<a class="anchor-link" href="#References">&#182;</a></h4><ol>
-<li><a href="http://www.openacc.org">http://www.openacc.org</a></li>
-<li><a href="https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf">OpenACC Reference Card</a></li>
-<li><a href="https://www.open-mpi.org/doc/v3.1/">https://www.open-mpi.org/doc/v3.1/</a></li>
-</ol>
+    <div class="prompt"></div>
 
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+</pre>
 </div>
 </div>
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<hr>
-<hr>
 
+<div class="output_area">
+
+    <div class="prompt output_prompt">Out[16]:</div>
+
+
+
+<div class="output_html rendered_html output_subarea output_execute_result">
+<div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>GPUs</th>
+      <th>time [s]</th>
+      <th>speedup</th>
+      <th>efficiency</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>1</td>
+      <td>1.4201</td>
+      <td>0.93,</td>
+      <td>92.67%</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>2</td>
+      <td>0.7157</td>
+      <td>1.83,</td>
+      <td>91.44%</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>4</td>
+      <td>0.4301</td>
+      <td>3.08,</td>
+      <td>76.91%</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>6</td>
+      <td>0.3037</td>
+      <td>4.32,</td>
+      <td>71.94%</td>
+    </tr>
+  </tbody>
+</table>
 </div>
 </div>
+
 </div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Solutions">Solutions<a name="solutions" /><a class="anchor-link" href="#Solutions">&#182;</a></h1><p>Below are suggested solutions. This is only a short description of the solution, but the <code>poisson2d.solution.(c|F03)</code> files linked below have the full source code. If you want to run / profile the solutions feel free to duplicate the cells for the tasks and change the <a href="#make">make target</a> to the <code>*.solution</code> ones.</p>
 
 </div>
 </div>
+
 </div>
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
@@ -13869,26 +14333,46 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Solution-0:">Solution 0:<a name="solution0" /><a class="anchor-link" href="#Solution-0:">&#182;</a></h2><div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span><span class="p">)</span>
+<h2 id="Solution-3:">Solution 3:<a name="solution3" /><a class="anchor-link" href="#Solution-3:">&#182;</a></h2><p>Update the boundaries first.</p>
+<div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop present(A,Anew)</span>
+<span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span> <span class="p">)</span>
 <span class="p">{</span>
-    <span class="cp">#pragma acc loop</span>
-    <span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">iy</span> <span class="o">=</span> <span class="n">iy_start</span><span class="p">;</span> <span class="n">iy</span> <span class="o">&lt;</span> <span class="n">iy_end</span><span class="p">;</span> <span class="n">iy</span><span class="o">++</span> <span class="p">)</span>
+    <span class="n">A</span><span class="p">[(</span><span class="n">iy_start</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="n">Anew</span><span class="p">[(</span><span class="n">iy_start</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">];</span>
+    <span class="n">A</span><span class="p">[(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="n">Anew</span><span class="p">[(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">];</span>
+<span class="p">}</span>
+</pre></div>
+<p>Start the copy of the interior rows asynchronously so that it overlaps with the MPI communication, and wait for its completion once the exchange has finished (a stand-alone sketch of the same overlap pattern follows the code below).</p>
+<div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop present(A,Anew) async</span>
+<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">iy</span> <span class="o">=</span> <span class="n">iy_start</span><span class="o">+</span><span class="mi">1</span><span class="p">;</span> <span class="n">iy</span> <span class="o">&lt;</span> <span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span> <span class="n">iy</span><span class="o">++</span><span class="p">)</span>
+<span class="p">{</span>
+    <span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span> <span class="p">)</span>
     <span class="p">{</span>
-        <span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mf">0.25</span> <span class="o">*</span> <span class="p">(</span><span class="n">rhs</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">-</span> <span class="p">(</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">+</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
-                                               <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">+</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="p">));</span>
-        <span class="n">error</span> <span class="o">=</span> <span class="n">fmaxr</span><span class="p">(</span> <span class="n">error</span><span class="p">,</span> <span class="n">fabsr</span><span class="p">(</span><span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span><span class="o">-</span><span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]));</span>
+        <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">];</span>
     <span class="p">}</span>
 <span class="p">}</span>
+
+<span class="c1">//Periodic boundary conditions</span>
+<span class="kt">int</span> <span class="n">top</span>    <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="o">?</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">:</span> <span class="n">rank</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">bottom</span> <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">))</span> <span class="o">?</span> <span class="mi">0</span> <span class="o">:</span> <span class="n">rank</span><span class="o">+</span><span class="mi">1</span><span class="p">;</span>
+<span class="cp">#pragma acc host_data use_device( A )</span>
+<span class="p">{</span>
+    <span class="kt">double</span> <span class="n">start_mpi</span> <span class="o">=</span> <span class="n">MPI_Wtime</span><span class="p">();</span>
+    <span class="c1">//1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom</span>
+    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="n">iy_start</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">A</span><span class="o">+</span><span class="n">iy_end</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
+
+    <span class="c1">//2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top</span>
+    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_start</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
+                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
+    <span class="n">mpi_time</span> <span class="o">+=</span> <span class="n">MPI_Wtime</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_mpi</span><span class="p">;</span>
+<span class="p">}</span>
+<span class="cp">#pragma acc wait</span>
 </pre></div>
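+<p>The same overlap can also be expressed with an explicitly numbered queue (<code>async(1)</code> / <code>wait(1)</code>). A hypothetical stand-alone sketch of the pattern, not the form used in <code>poisson2d.solution.c</code>:</p>
+<div class="highlight"><pre>// Hypothetical illustration of async/wait overlap with a numbered queue
+// (assumes an OpenACC compiler; the loop stands in for the interior copy).
+#include &lt;stdio.h&gt;
+#include &lt;stdlib.h&gt;
+
+int main(void) {
+    const int n = 1 &lt;&lt; 20;
+    double* a = (double*)malloc(n * sizeof(double));
+    for (int i = 0; i &lt; n; i++) a[i] = 1.0;
+
+    #pragma acc data copy(a[0:n])
+    {
+        // Launch the device work on async queue 1; control returns to the
+        // host immediately so CPU-side work can proceed in parallel.
+        #pragma acc parallel loop async(1)
+        for (int i = 0; i &lt; n; i++) a[i] = 2.0 * a[i];
+
+        // ... host-side work, e.g. the MPI halo exchange above, runs here
+        //     and overlaps with queue 1 ...
+
+        // Block until queue 1 has drained before the data region copies
+        // the result back to the host.
+        #pragma acc wait(1)
+    }
+
+    printf("a[0] = %f\n", a[0]);
+    free(a);
+    return 0;
+}
+</pre></div>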
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task0/poisson2d.solution.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task0/poisson2d.solution.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task0/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task0/">Fortran Version</a></li>
+<li><a href="/edit/C/task3/poisson2d.solution.c">C Version</a></li>
+<li><a href="/edit/FORTRAN/task3/poisson2d.solution.F03">Fortran Version</a></li>
 </ul>
 
 </div>
@@ -13897,17 +14381,18 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  <strong>After</strong> the profiling finished the output file <code>poisson2d.solution.pgprof</code>  can be downloaded from here: <a href="/tree/C/task0/poisson2d.solution.pgprof?download=1">C Version</a> / <a href="/tree/FORTRAN/task0/poisson2d.solution.pgprof?download=1">Fortran Version</a>.</p>
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task2.solution.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[22]:</div>
+<div class="prompt input_prompt">In&nbsp;[17]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task0
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task3
 </pre></div>
 
     </div>
@@ -13924,7 +14409,7 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task0
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task3
 </pre>
 </div>
 </div>
@@ -13935,10 +14420,10 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[23]:</div>
+<div class="prompt input_prompt">In&nbsp;[18]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task0&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
 <span class="o">!</span>make poisson2d.solution
 </pre></div>
 
@@ -13956,7 +14441,53 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>make: `poisson2d.solution&#39; is up to date.
+<pre>mpicc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o
+poisson2d_serial:
+     36, Generating present(Anew[:],rhs[:],Aref[:])
+     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])
+     42, Generating Tesla code
+         43, #pragma acc loop gang /* blockIdx.x */
+         44, #pragma acc loop vector(128) /* threadIdx.x */
+         49, Generating implicit reduction(max:error)
+     44, Loop is parallelizable
+     53, Generating Tesla code
+         54, #pragma acc loop gang /* blockIdx.x */
+         55, #pragma acc loop vector(128) /* threadIdx.x */
+     55, Loop is parallelizable
+     61, Generating Tesla code
+         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     66, Generating Tesla code
+         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     78, Generating update self(Aref[:ny*nx])
+mpicc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution
+poisson2d.solution.c:
+main:
+     71, Generating enter data create(rhs[:ny*nx],Aref[:ny*nx],A[:ny*nx],Anew[:ny*nx])
+     87, Generating present(Aref[:],A[:])
+         Generating Tesla code
+         88, #pragma acc loop gang /* blockIdx.x */
+         89, #pragma acc loop vector(128) /* threadIdx.x */
+     89, Loop is parallelizable
+    140, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])
+    143, Generating present(A[:],rhs[:],Anew[:])
+         Generating Tesla code
+        144, #pragma acc loop gang /* blockIdx.x */
+        145, #pragma acc loop vector(128) /* threadIdx.x */
+        149, Generating implicit reduction(max:error)
+    145, Loop is parallelizable
+    157, Generating present(Anew[:],A[:])
+         Generating Tesla code
+        158, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    163, Generating present(Anew[:],A[:])
+         Generating Tesla code
+        164, #pragma acc loop gang /* blockIdx.x */
+        165, #pragma acc loop vector(128) /* threadIdx.x */
+    165, Loop is parallelizable
+    191, Generating present(A[:])
+         Generating Tesla code
+        192, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    202, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])
+    220, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])
 </pre>
 </div>
 </div>
@@ -13967,11 +14498,11 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[24]:</div>
+<div class="prompt input_prompt">In&nbsp;[19]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task0&#39;</span><span class="p">)</span>
-<span class="o">!</span>make run.solution
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution
 </pre></div>
 
     </div>
@@ -13988,24 +14519,87 @@ Then you can import them into pgprof / nvvp using the <em>Import</em> option in
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution
-Job &lt;4697&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
+Job &lt;25201&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
-Jacobi relaxation Calculation: 2048 x 2048 mesh
-Calculate reference solution and time serial CPU execution.
-    0, 0.249999
-  100, 0.249760
-  200, 0.249522
-  300, 0.249285
-  400, 0.249048
-GPU execution.
-    0, 0.249999
-  100, 0.249760
-  200, 0.249522
-  300, 0.249285
-  400, 0.249048
-2048x2048: 1 CPU:   5.5979 s, 1 GPU:   0.2241 s, speedup:    24.98
+Jacobi relaxation Calculation: 4096 x 4096 mesh
+Calculate reference solution and time serial execution.
+    0, 0.250000
+  100, 0.249940
+  200, 0.249880
+  300, 0.249821
+  400, 0.249761
+  500, 0.249702
+  600, 0.249642
+  700, 0.249583
+  800, 0.249524
+  900, 0.249464
+Parallel execution.
+    0, 0.250000
+  100, 0.249940
+  200, 0.249880
+  300, 0.249821
+  400, 0.249761
+  500, 0.249702
+  600, 0.249642
+  700, 0.249583
+  800, 0.249524
+  900, 0.249464
+Num GPUs: 2.
+4096x4096: 1 GPU:   1.3175 s, 2 GPUs:   0.6962 s, speedup:     1.89, efficiency:    94.62%
+MPI time:   0.0583 s, inter GPU BW:     2.09 GiB/s
+</pre>
+</div>
+</div>
+
+</div>
+</div>
+
+</div>
+<div class="cell border-box-sizing code_cell rendered">
+<div class="input">
+<div class="prompt input_prompt">In&nbsp;[20]:</div>
+<div class="inner_cell">
+    <div class="input_area">
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile.solution
+</pre></div>
+
+    </div>
+</div>
+</div>
+
+<div class="output_wrapper">
+<div class="output">
+
+
+<div class="output_area">
+
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+Job &lt;25202&gt; is submitted to default queue &lt;batch&gt;.
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+==93249== PGPROF is profiling process 93249, command: ./poisson2d.solution 10
+==93248== PGPROF is profiling process 93248, command: ./poisson2d.solution 10
+==93249== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.1.pgprof
+Jacobi relaxation Calculation: 4096 x 4096 mesh
+Calculate reference solution and time serial execution.
+    0, 0.250000
+Parallel execution.
+    0, 0.250000
+Num GPUs: 2.
+4096x4096: 1 GPU:   0.0262 s, 2 GPUs:   0.0127 s, speedup:     2.06, efficiency:   103.02%
+MPI time:   0.0009 s, inter GPU BW:     1.39 GiB/s
+==93248== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.0.pgprof
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.?.pgprof  .
+tar -cvzf pgprof.poisson2d.Task3.solution.tar.gz poisson2d.solution.Task3.NP2.?.pgprof
+poisson2d.solution.Task3.NP2.0.pgprof
+poisson2d.solution.Task3.NP2.1.pgprof
 </pre>
 </div>
 </div>
@@ -14013,14 +14607,31 @@ GPU execution.
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<h4 id="Scaling">Scaling<a class="anchor-link" href="#Scaling">&#182;</a></h4><p>You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell.</p>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[25]:</div>
+<div class="prompt input_prompt">In&nbsp;[21]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task0&#39;</span><span class="p">)</span>
-<span class="o">!</span>make profile.solution
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run.solution <span class="p">|</span> grep speedup &gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="n">data_frameS3</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+
+<span class="o">!</span>rm scale.out
+
+<span class="n">data_frameS3b</span><span class="o">=</span><span class="n">data_frameS3</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frameS3b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
 </pre></div>
 
     </div>
@@ -14037,23 +14648,85 @@ GPU execution.
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof ./poisson2d.solution 10
-Job &lt;4698&gt; is submitted to default queue &lt;batch&gt;.
+<pre>&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
-==33475== PGPROF is profiling process 33475, command: ./poisson2d.solution 10
-==33475== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof
-Jacobi relaxation Calculation: 2048 x 2048 mesh
-Calculate reference solution and time serial CPU execution.
-    0, 0.249999
-GPU execution.
-    0, 0.249999
-2048x2048: 1 CPU:   0.1245 s, 1 GPU:   0.0220 s, speedup:     5.66
-mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 </pre>
 </div>
 </div>
 
+<div class="output_area">
+
+    <div class="prompt output_prompt">Out[21]:</div>
+
+
+
+<div class="output_html rendered_html output_subarea output_execute_result">
+<div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>GPUs</th>
+      <th>time [s]</th>
+      <th>speedup</th>
+      <th>efficiency</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>1</td>
+      <td>1.3935</td>
+      <td>0.94,</td>
+      <td>93.86%</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>2</td>
+      <td>0.6910</td>
+      <td>1.89,</td>
+      <td>94.52%</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>4</td>
+      <td>0.3920</td>
+      <td>3.37,</td>
+      <td>84.13%</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>6</td>
+      <td>0.2841</td>
+      <td>4.58,</td>
+      <td>76.29%</td>
+    </tr>
+  </tbody>
+</table>
+</div>
+</div>
+
+</div>
+
 </div>
 </div>
 
@@ -14061,6 +14734,8 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
+<p>The overlap of compute and communication can be seen in the profiler, e.g. as shown below.</p>
+<p><img src="./resources/Solution3.png" alt="Solution3.png"></p>
 <p><a href="#top">Back to Top</a></p>
 <hr>
 
@@ -14070,26 +14745,56 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Solution-1:">Solution 1:<a name="solution1" /><a class="anchor-link" href="#Solution-1:">&#182;</a></h2><p>Swap the <code>ix</code> and <code>iy</code> loops to make sure that <code>ix</code> is the fastest running index</p>
-<div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">iy</span> <span class="o">=</span> <span class="n">iy_start</span><span class="p">;</span> <span class="n">iy</span> <span class="o">&lt;</span> <span class="n">iy_end</span><span class="p">;</span> <span class="n">iy</span><span class="o">++</span><span class="p">)</span>
+<h2 id="Solution-4:">Solution 4:<a name="solution4" /><a class="anchor-link" href="#Solution-4:">&#182;</a></h2><p>Include NVSHMEM headers</p>
+<div class="highlight"><pre><span></span><span class="cp">#include</span> <span class="cpf">&lt;nvshmem.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span> <span class="cpf">&lt;nvshmemx.h&gt;</span><span class="cp"></span>
+</pre></div>
+<p>and initialize NVSHMEM with MPI</p>
+<div class="highlight"><pre><span></span><span class="n">MPI_Comm</span> <span class="n">mpi_comm</span> <span class="o">=</span> <span class="n">MPI_COMM_WORLD</span><span class="p">;</span>
+<span class="n">nvshmemx_init_attr_t</span> <span class="n">attr</span><span class="p">;</span>
+<span class="n">attr</span><span class="p">.</span><span class="n">mpi_comm</span> <span class="o">=</span> <span class="o">&amp;</span><span class="n">mpi_comm</span><span class="p">;</span>
+<span class="n">nvshmemx_init_attr</span><span class="p">(</span><span class="n">NVSHMEMX_INIT_WITH_MPI_COMM</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">attr</span><span class="p">);</span>
+</pre></div>
+<p>Allocate device memory with NVSHMEM and map it to the host allocation for OpenACC</p>
+<div class="highlight"><pre><span></span><span class="n">real</span> <span class="o">*</span><span class="n">d_A</span> <span class="o">=</span> <span class="p">(</span><span class="n">real</span> <span class="o">*</span><span class="p">)</span><span class="n">nvshmem_malloc</span><span class="p">(</span><span class="n">nx</span> <span class="o">*</span> <span class="n">ny</span> <span class="o">*</span> <span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
+<span class="n">map</span><span class="p">(</span><span class="n">A</span><span class="p">,</span> <span class="n">d_A</span><span class="p">,</span> <span class="n">nx</span> <span class="o">*</span> <span class="n">ny</span> <span class="o">*</span> <span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
+</pre></div>
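+<p>The <code>map()</code> helper comes from the task sources. A minimal sketch of what it could look like, assuming it simply wraps the OpenACC runtime routine <code>acc_map_data</code> so that <code>present()</code> clauses find the NVSHMEM allocation instead of creating a separate device copy:</p>
+<div class="highlight"><pre>#include &lt;openacc.h&gt;
+#include &lt;stddef.h&gt;
+
+// Hypothetical sketch of the map() helper used above; the actual helper is
+// defined in the task sources and may differ in details.
+void map(void *host_ptr, void *device_ptr, size_t bytes) {
+    // Associate the existing host buffer with the NVSHMEM device allocation,
+    // so OpenACC data clauses reuse it instead of creating another copy.
+    acc_map_data(host_ptr, device_ptr, bytes);
+}
+</pre></div>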
+<p>Calculate the correct locations on the remote GPUs and communicate the data</p>
+<div class="highlight"><pre><span></span><span class="c1">// Periodic boundary conditions</span>
+<span class="kt">int</span> <span class="n">top</span> <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="o">?</span> <span class="p">(</span><span class="n">size</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">:</span> <span class="n">rank</span> <span class="o">-</span> <span class="mi">1</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">bottom</span> <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="p">(</span><span class="n">size</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> <span class="o">?</span> <span class="mi">0</span> <span class="o">:</span> <span class="n">rank</span> <span class="o">+</span> <span class="mi">1</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">iy_start_top</span> <span class="o">=</span> <span class="n">top</span> <span class="o">*</span> <span class="n">chunk_size</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">iy_end_top</span> <span class="o">=</span> <span class="n">iy_start_top</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">;</span>
+
+<span class="c1">// Do not process boundaries</span>
+<span class="n">iy_start_top</span> <span class="o">=</span> <span class="n">max</span><span class="p">(</span><span class="n">iy_start_top</span><span class="p">,</span> <span class="mi">1</span><span class="p">);</span>
+<span class="n">iy_end_top</span> <span class="o">=</span> <span class="n">min</span><span class="p">(</span><span class="n">iy_end_top</span><span class="p">,</span> <span class="n">ny</span> <span class="o">-</span> <span class="mi">1</span><span class="p">);</span>
+
+<span class="kt">int</span> <span class="n">iy_start_bottom</span> <span class="o">=</span> <span class="n">bottom</span> <span class="o">*</span> <span class="n">chunk_size</span><span class="p">;</span>
+<span class="kt">int</span> <span class="n">iy_end_bottom</span> <span class="o">=</span> <span class="n">iy_start_bottom</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">;</span>
+
+<span class="c1">// Do not process boundaries</span>
+<span class="n">iy_start_bottom</span> <span class="o">=</span> <span class="n">max</span><span class="p">(</span><span class="n">iy_start_bottom</span><span class="p">,</span> <span class="mi">1</span><span class="p">);</span>
+<span class="n">iy_end_bottom</span> <span class="o">=</span> <span class="n">min</span><span class="p">(</span><span class="n">iy_end_bottom</span><span class="p">,</span> <span class="n">ny</span> <span class="o">-</span> <span class="mi">1</span><span class="p">);</span>
+
+<span class="c1">// Halo exchange</span>
+<span class="cp">#pragma acc host_data use_device(A)</span>
 <span class="p">{</span>
-    <span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span> <span class="p">)</span>
-    <span class="p">{</span>
-        <span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mf">0.25</span> <span class="o">*</span> <span class="p">(</span><span class="n">rhs</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">-</span> <span class="p">(</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">+</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
-                                               <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">+</span> <span class="n">A</span><span class="p">[(</span><span class="n">iy</span><span class="o">+</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="p">));</span>
-        <span class="n">error</span> <span class="o">=</span> <span class="n">fmaxr</span><span class="p">(</span> <span class="n">error</span><span class="p">,</span> <span class="n">fabsr</span><span class="p">(</span><span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span><span class="o">-</span><span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]));</span>
-    <span class="p">}</span>
+    <span class="kt">double</span> <span class="n">start_mpi</span> <span class="o">=</span> <span class="n">MPI_Wtime</span><span class="p">();</span>
+    <span class="n">nvshmem_double_put</span><span class="p">((</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="n">iy_end_top</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span>
+                       <span class="p">(</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="n">iy_start</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span> <span class="p">(</span><span class="n">ix_end</span> <span class="o">-</span> <span class="n">ix_start</span><span class="p">),</span> <span class="n">top</span><span class="p">);</span>
+    <span class="n">nvshmem_double_put</span><span class="p">((</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="p">(</span><span class="n">iy_start_bottom</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span>
+                       <span class="p">(</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="p">(</span><span class="n">iy_end</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span> <span class="p">(</span><span class="n">ix_end</span> <span class="o">-</span> <span class="n">ix_start</span><span class="p">),</span>
+                       <span class="n">bottom</span><span class="p">);</span>
+    <span class="n">nvshmem_barrier_all</span><span class="p">();</span>
+    <span class="n">mpi_time</span> <span class="o">+=</span> <span class="n">MPI_Wtime</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_mpi</span><span class="p">;</span>
 <span class="p">}</span>
 </pre></div>
+<p>Finally, remember to deallocate:</p>
+<div class="highlight"><pre><span></span><span class="n">nvshmem_free</span><span class="p">(</span><span class="n">d_A</span><span class="p">);</span>
+</pre></div>
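+<p>The NVSHMEM runtime is typically also shut down before MPI is finalized. Assuming the solution follows the usual pattern (the exact cleanup code is in the solution source), this looks like:</p>
+<div class="highlight"><pre>nvshmem_finalize();
+MPI_Finalize();
+</pre></div>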
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task1/poisson2d.solution.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task1/poisson2d.solution.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task1/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task1/">Fortran Version</a></li>
+<li><a href="./C/task4/poisson2d.solution.c">C Version</a></li>
 </ul>
 
 </div>
@@ -14098,17 +14803,18 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  <strong>After</strong> the profiling finished the output file <code>poisson2d.solution.pgprof</code>  can be downloaded from here: <a href="/tree/C/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1">Fortran Version</a>.</p>
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task4.solution.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[26]:</div>
+<div class="prompt input_prompt">In&nbsp;[22]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task1
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task4
 </pre></div>
 
     </div>
@@ -14125,7 +14831,7 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task1
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task4
 </pre>
 </div>
 </div>
@@ -14136,10 +14842,10 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[27]:</div>
+<div class="prompt input_prompt">In&nbsp;[23]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
 <span class="o">!</span>make poisson2d.solution
 </pre></div>
 
@@ -14157,7 +14863,50 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>make: `poisson2d.solution&#39; is up to date.
+<pre>mpicxx -c -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o
+poisson2d_serial(int, int, double, double *, double *, int, int, const double *):
+     37, Generating present(Anew[:],rhs[:],Aref[:])
+     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])
+     40, Generating Tesla code
+         43, #pragma acc loop gang /* blockIdx.x */
+         44, #pragma acc loop vector(128) /* threadIdx.x */
+         49, Generating implicit reduction(max:error)
+     44, Loop is parallelizable
+     51, Generating Tesla code
+         54, #pragma acc loop gang /* blockIdx.x */
+         55, #pragma acc loop vector(128) /* threadIdx.x */
+     55, Loop is parallelizable
+     58, Generating Tesla code
+         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     65, Generating Tesla code
+         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     77, Generating update self(Aref[:ny*nx])
+mpicxx -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned -I/gpfs/wolf/trn003/world-shared/software/nvshmem//include poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution -L/gpfs/wolf/trn003/world-shared/software/nvshmem//lib -lnvshmem -Mcuda -lcuda -lrt 
+poisson2d.solution.c:
+main:
+     90, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])
+    101, Generating present(Aref[:],A[:])
+         Generating Tesla code
+        105, #pragma acc loop gang /* blockIdx.x */
+        106, #pragma acc loop vector(128) /* threadIdx.x */
+    106, Loop is parallelizable
+    162, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])
+    163, Generating present(A[:],rhs[:],Anew[:])
+         Generating Tesla code
+        166, #pragma acc loop gang /* blockIdx.x */
+        167, #pragma acc loop vector(128) /* threadIdx.x */
+        171, Generating implicit reduction(max:error)
+    167, Loop is parallelizable
+    177, Generating present(Anew[:],A[:])
+         Generating Tesla code
+        180, #pragma acc loop gang /* blockIdx.x */
+        181, #pragma acc loop vector(128) /* threadIdx.x */
+    181, Loop is parallelizable
+    214, Generating present(A[:])
+         Generating Tesla code
+        217, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    227, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])
+    246, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])
 </pre>
 </div>
 </div>
@@ -14168,11 +14917,11 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[28]:</div>
+<div class="prompt input_prompt">In&nbsp;[24]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
-<span class="o">!</span>make run.solution
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution
 </pre></div>
 
     </div>
@@ -14189,24 +14938,37 @@ mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution
-Job &lt;4699&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
+Job &lt;25207&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
-Jacobi relaxation Calculation: 2048 x 2048 mesh
-Calculate reference solution and time serial CPU execution.
-    0, 0.249999
-  100, 0.249760
-  200, 0.249522
-  300, 0.249285
-  400, 0.249048
-GPU execution.
-    0, 0.249999
-  100, 0.249760
-  200, 0.249522
-  300, 0.249285
-  400, 0.249048
-2048x2048: 1 CPU:   5.5086 s, 1 GPU:   0.2293 s, speedup:    24.02
+WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation 
+Jacobi relaxation Calculation: 4096 x 4096 mesh
+Calculate reference solution and time serial execution.
+    0, 0.250000
+  100, 0.249940
+  200, 0.249880
+  300, 0.249821
+  400, 0.249761
+  500, 0.249702
+  600, 0.249642
+  700, 0.249583
+  800, 0.249524
+  900, 0.249464
+Parallel execution.
+    0, 0.250000
+  100, 0.249940
+  200, 0.249880
+  300, 0.249821
+  400, 0.249761
+  500, 0.249702
+  600, 0.249642
+  700, 0.249583
+  800, 0.249524
+  900, 0.249464
+Num GPUs: 2.
+4096x4096: 1 GPU:   1.3171 s, 2 GPUs:   0.7377 s, speedup:     1.79, efficiency:    89.27%
+MPI time:   0.0686 s, inter GPU BW:     1.78 GiB/s
 </pre>
 </div>
 </div>
@@ -14217,11 +14979,11 @@ GPU execution.
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[29]:</div>
+<div class="prompt input_prompt">In&nbsp;[25]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task1&#39;</span><span class="p">)</span>
-<span class="o">!</span>make profile.solution
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile.solution
 </pre></div>
 
     </div>
@@ -14238,50 +15000,27 @@ GPU execution.
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.timeline.pgprof ./poisson2d.solution 3
-Job &lt;4700&gt; is submitted to default queue &lt;batch&gt;.
-&lt;&lt;Waiting for dispatch ...&gt;&gt;
-&lt;&lt;Starting on login1&gt;&gt;
-==78449== PGPROF is profiling process 78449, command: ./poisson2d.solution 3
-==78449== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.timeline.pgprof
-Jacobi relaxation Calculation: 2048 x 2048 mesh
-Calculate reference solution and time serial CPU execution.
-    0, 0.249999
-GPU execution.
-    0, 0.249999
-2048x2048: 1 CPU:   0.0476 s, 1 GPU:   0.0190 s, speedup:     2.51
-bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  --analysis-metrics -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.metrics.pgprof ./poisson2d.solution 3
-Job &lt;4701&gt; is submitted to default queue &lt;batch&gt;.
-&lt;&lt;Waiting for dispatch ...&gt;&gt;
-&lt;&lt;Starting on login1&gt;&gt;
-==33668== PGPROF is profiling process 33668, command: ./poisson2d.solution 3
-==33668== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
-==33668== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.metrics.pgprof
-Jacobi relaxation Calculation: 2048 x 2048 mesh
-Calculate reference solution and time serial CPU execution.
-    0, 0.249999
-GPU execution.
-    0, 0.249999
-2048x2048: 1 CPU:   0.0490 s, 1 GPU:  15.6526 s, speedup:     0.00
-bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  --metrics gld_efficiency,gst_efficiency -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3
-Job &lt;4702&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+Job &lt;25208&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
-==78646== PGPROF is profiling process 78646, command: ./poisson2d.solution 3
-==78646== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
-==78646== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.efficiency.pgprof
-Jacobi relaxation Calculation: 2048 x 2048 mesh
-Calculate reference solution and time serial CPU execution.
-    0, 0.249999
-GPU execution.
-    0, 0.249999
-2048x2048: 1 CPU:   0.0489 s, 1 GPU:   0.6829 s, speedup:     0.07
-pgprof --csv -i /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.efficiency.pgprof 2&gt;&amp;1 | grep -v &#34;======&#34; &gt; poisson2d.solution.efficiency.csv
-mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.*.pgprof .
-tar -cvzf pgprof.poisson2d.Task1.solution.tar.gz  poisson2d.solution.*.pgprof
-poisson2d.solution.efficiency.pgprof
-poisson2d.solution.metrics.pgprof
-poisson2d.solution.timeline.pgprof
+==93971== PGPROF is profiling process 93971, command: ./poisson2d.solution 10
+==93970== PGPROF is profiling process 93970, command: ./poisson2d.solution 10
+==93971== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.0.pgprof
+WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation 
+Jacobi relaxation Calculation: 4096 x 4096 mesh
+Calculate reference solution and time serial execution.
+    0, 0.250000
+Parallel execution.
+    0, 0.250000
+Num GPUs: 2.
+4096x4096: 1 GPU:   0.0225 s, 2 GPUs:   0.0132 s, speedup:     1.71, efficiency:    85.34%
+MPI time:   0.0010 s, inter GPU BW:     1.24 GiB/s
+==93970== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.1.pgprof
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.?.pgprof  .
+tar -cvzf pgprof.poisson2d.Task4.solution.tar.gz poisson2d.solution.Task4.NP2.?.pgprof
+poisson2d.solution.Task4.NP2.0.pgprof
+poisson2d.solution.Task4.NP2.1.pgprof
 </pre>
 </div>
 </div>
@@ -14293,19 +15032,27 @@ poisson2d.solution.timeline.pgprof
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>For the <em>Global Memory Load/Store Efficiency</em> the <code>make profile</code> command also generated a CSV file that you can import and view with the cell below.<br>
-If you purely work in a terminal you can view the same output by running <code>pgprof -i poisson2d.efficiency.solution.pgprof</code>.</p>
+<h4 id="Scaling">Scaling<a class="anchor-link" href="#Scaling">&#182;</a></h4><p>You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[30]:</div>
+<div class="prompt input_prompt">In&nbsp;[26]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">data_frame_solution</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;poisson2d.solution.efficiency.csv&#39;</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">&#39;,&#39;</span><span class="p">)</span>
-<span class="n">data_frame_solution</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task4&#39;</span><span class="p">)</span>
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run.solution <span class="p">|</span> grep speedup &gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt; scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
+<span class="n">data_frameS4</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+
+<span class="o">!</span>rm scale.out
+
+<span class="n">data_frameS4b</span><span class="o">=</span><span class="n">data_frameS4</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frameS4b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
 </pre></div>
 
     </div>
@@ -14318,7 +15065,25 @@ If you purely work in a terminal you can view the same output by running <code>p
 
 <div class="output_area">
 
-    <div class="prompt output_prompt">Out[30]:</div>
+    <div class="prompt"></div>
+
+
+<div class="output_subarea output_stream output_stdout output_text">
+<pre>&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+&lt;&lt;Waiting for dispatch ...&gt;&gt;
+&lt;&lt;Starting on login1&gt;&gt;
+</pre>
+</div>
+</div>
+
+<div class="output_area">
+
+    <div class="prompt output_prompt">Out[26]:</div>
 
 
 
@@ -14341,148 +15106,40 @@ If you purely work in a terminal you can view the same output by running <code>p
   <thead>
     <tr style="text-align: right;">
       <th></th>
-      <th>Device</th>
-      <th>Kernel</th>
-      <th>Invocations</th>
-      <th>Metric Name</th>
-      <th>Metric Description</th>
-      <th>Min</th>
-      <th>Max</th>
-      <th>Avg</th>
+      <th>GPUs</th>
+      <th>time [s]</th>
+      <th>speedup</th>
+      <th>efficiency</th>
     </tr>
   </thead>
-  <tbody>
-    <tr>
-      <th>0</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_70_gpu</td>
-      <td>1</td>
-      <td>gld_efficiency</td>
-      <td>Global Memory Load Efficiency</td>
-      <td>0.000000%</td>
-      <td>0.000000%</td>
-      <td>0.000000%</td>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>1</td>
+      <td>1.3685</td>
+      <td>0.96,</td>
+      <td>96.08%</td>
     </tr>
     <tr>
       <th>1</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_70_gpu</td>
-      <td>1</td>
-      <td>gst_efficiency</td>
-      <td>Global Memory Store Efficiency</td>
-      <td>100.000000%</td>
-      <td>100.000000%</td>
-      <td>100.000000%</td>
+      <td>2</td>
+      <td>0.7472</td>
+      <td>1.78,</td>
+      <td>88.90%</td>
     </tr>
     <tr>
       <th>2</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_95_gpu</td>
-      <td>3</td>
-      <td>gld_efficiency</td>
-      <td>Global Memory Load Efficiency</td>
-      <td>91.879935%</td>
-      <td>91.897053%</td>
-      <td>91.888339%</td>
+      <td>4</td>
+      <td>0.4605</td>
+      <td>2.85,</td>
+      <td>71.27%</td>
     </tr>
     <tr>
       <th>3</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_95_gpu</td>
-      <td>3</td>
-      <td>gst_efficiency</td>
-      <td>Global Memory Store Efficiency</td>
-      <td>88.845486%</td>
-      <td>88.845486%</td>
-      <td>88.845486%</td>
-    </tr>
-    <tr>
-      <th>4</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_102_gpu__red</td>
-      <td>3</td>
-      <td>gld_efficiency</td>
-      <td>Global Memory Load Efficiency</td>
-      <td>99.756335%</td>
-      <td>99.756335%</td>
-      <td>99.756335%</td>
-    </tr>
-    <tr>
-      <th>5</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_102_gpu__red</td>
-      <td>3</td>
-      <td>gst_efficiency</td>
-      <td>Global Memory Store Efficiency</td>
-      <td>25.000000%</td>
-      <td>25.000000%</td>
-      <td>25.000000%</td>
-    </tr>
-    <tr>
-      <th>6</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_122_gpu</td>
-      <td>3</td>
-      <td>gld_efficiency</td>
-      <td>Global Memory Load Efficiency</td>
-      <td>25.000000%</td>
-      <td>25.000000%</td>
-      <td>25.000000%</td>
-    </tr>
-    <tr>
-      <th>7</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_122_gpu</td>
-      <td>3</td>
-      <td>gst_efficiency</td>
-      <td>Global Memory Store Efficiency</td>
-      <td>25.000000%</td>
-      <td>25.000000%</td>
-      <td>25.000000%</td>
-    </tr>
-    <tr>
-      <th>8</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_106_gpu</td>
-      <td>3</td>
-      <td>gld_efficiency</td>
-      <td>Global Memory Load Efficiency</td>
-      <td>91.823101%</td>
-      <td>91.890100%</td>
-      <td>91.851075%</td>
-    </tr>
-    <tr>
-      <th>9</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_106_gpu</td>
-      <td>3</td>
-      <td>gst_efficiency</td>
-      <td>Global Memory Store Efficiency</td>
-      <td>88.956522%</td>
-      <td>88.956522%</td>
-      <td>88.956522%</td>
-    </tr>
-    <tr>
-      <th>10</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_116_gpu</td>
-      <td>3</td>
-      <td>gld_efficiency</td>
-      <td>Global Memory Load Efficiency</td>
-      <td>94.722222%</td>
-      <td>94.722222%</td>
-      <td>94.722222%</td>
-    </tr>
-    <tr>
-      <th>11</th>
-      <td>Tesla V100-SXM2-16GB (0)</td>
-      <td>main_116_gpu</td>
-      <td>3</td>
-      <td>gst_efficiency</td>
-      <td>Global Memory Store Efficiency</td>
-      <td>88.956522%</td>
-      <td>88.956522%</td>
-      <td>88.956522%</td>
+      <td>6</td>
+      <td>0.3612</td>
+      <td>3.60,</td>
+      <td>60.05%</td>
     </tr>
   </tbody>
 </table>
@@ -14498,6 +15155,8 @@ If you purely work in a terminal you can view the same output by running <code>p
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
+<p>The communication using NVSHMEM and the barrier, which is executed as a kernel on the device, can be seen in the profiler, e.g. as shown below.</p>
+<p><img src="./resources/Solution4.png" alt="Solution4.png"></p>
 <p><a href="#top">Back to Top</a></p>
 <hr>
 
@@ -14507,57 +15166,26 @@ If you purely work in a terminal you can view the same output by running <code>p
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Solution-2:">Solution 2:<a name="solution2" /><a class="anchor-link" href="#Solution-2:">&#182;</a></h2><p>Set the GPU used by the rank using <code>#pragma acc set device_num</code></p>
-<div class="highlight"><pre><span></span><span class="c1">//Initialize MPI and determine rank and size</span>
-<span class="n">MPI_Init</span><span class="p">(</span><span class="o">&amp;</span><span class="n">argc</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">argv</span><span class="p">);</span>
-<span class="n">MPI_Comm_rank</span><span class="p">(</span><span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">rank</span><span class="p">);</span>
-<span class="n">MPI_Comm_size</span><span class="p">(</span><span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">size</span><span class="p">);</span>
-
-<span class="cp">#pragma acc set device_num( rank )</span>
-
-<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">A</span>    <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
-<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">Aref</span> <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
-<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">Anew</span> <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
-<span class="n">real</span><span class="o">*</span> <span class="kr">restrict</span> <span class="k">const</span> <span class="n">rhs</span>  <span class="o">=</span> <span class="p">(</span><span class="n">real</span><span class="o">*</span><span class="p">)</span> <span class="n">malloc</span><span class="p">(</span><span class="n">nx</span><span class="o">*</span><span class="n">ny</span><span class="o">*</span><span class="k">sizeof</span><span class="p">(</span><span class="n">real</span><span class="p">));</span>
-</pre></div>
-<p>Apply domain decomposition</p>
-<div class="highlight"><pre><span></span><span class="c1">// Ensure correctness if ny%size != 0</span>
-<span class="kt">int</span> <span class="n">chunk_size</span> <span class="o">=</span> <span class="n">ceil</span><span class="p">(</span> <span class="p">(</span><span class="mf">1.0</span><span class="o">*</span><span class="n">ny</span><span class="p">)</span><span class="o">/</span><span class="n">size</span> <span class="p">);</span>
-
-<span class="kt">int</span> <span class="n">iy_start</span> <span class="o">=</span> <span class="n">rank</span> <span class="o">*</span> <span class="n">chunk_size</span><span class="p">;</span>
-<span class="kt">int</span> <span class="n">iy_end</span>   <span class="o">=</span> <span class="n">iy_start</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">;</span>
-
-<span class="c1">// Do not process boundaries</span>
-<span class="n">iy_start</span> <span class="o">=</span> <span class="n">max</span><span class="p">(</span> <span class="n">iy_start</span><span class="p">,</span> <span class="mi">1</span> <span class="p">);</span>
-<span class="n">iy_end</span> <span class="o">=</span> <span class="n">min</span><span class="p">(</span> <span class="n">iy_end</span><span class="p">,</span> <span class="n">ny</span> <span class="o">-</span> <span class="mi">1</span> <span class="p">);</span>
-</pre></div>
-<p>Exchange data</p>
-<div class="highlight"><pre><span></span><span class="c1">//Periodic boundary conditions</span>
-<span class="kt">int</span> <span class="n">top</span>    <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="o">?</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">:</span> <span class="n">rank</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span>
-<span class="kt">int</span> <span class="n">bottom</span> <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">))</span> <span class="o">?</span> <span class="mi">0</span> <span class="o">:</span> <span class="n">rank</span><span class="o">+</span><span class="mi">1</span><span class="p">;</span>
-<span class="cp">#pragma acc host_data use_device( A )</span>
+<h2 id="Solution-5:">Solution 5:<a name="solution5" /><a class="anchor-link" href="#Solution-5:">&#182;</a></h2><p>Basically all kernels in the <code>while</code> loop can use the async keyword. Please take a look in the solution source code. They will all use the OpenACC default async queue.</p>
+<p>To also place the halo exchange in the same queue, use:</p>
+<div class="highlight"><pre><span></span><span class="cp">#pragma acc host_data use_device(A)</span>
 <span class="p">{</span>
-    <span class="kt">double</span> <span class="n">start_mpi</span> <span class="o">=</span> <span class="n">MPI_Wtime</span><span class="p">();</span>
-    <span class="c1">//1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom</span>
-    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="n">iy_start</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">A</span><span class="o">+</span><span class="n">iy_end</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
-
-    <span class="c1">//2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top</span>
-    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_start</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
-    <span class="n">mpi_time</span> <span class="o">+=</span> <span class="n">MPI_Wtime</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_mpi</span><span class="p">;</span>
+    <span class="n">nvshmemx_double_put_on_stream</span><span class="p">(</span>
+        <span class="p">(</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="n">iy_end_top</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span>
+        <span class="p">(</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="n">iy_start</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span> <span class="p">(</span><span class="n">ix_end</span> <span class="o">-</span> <span class="n">ix_start</span><span class="p">),</span> <span class="n">top</span><span class="p">,</span>
+        <span class="p">(</span><span class="n">cudaStream_t</span><span class="p">)</span><span class="n">acc_get_cuda_stream</span><span class="p">(</span><span class="n">acc_get_default_async</span><span class="p">()));</span>
+    <span class="n">nvshmemx_double_put_on_stream</span><span class="p">(</span>
+        <span class="p">(</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="p">(</span><span class="n">iy_start_bottom</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span>
+        <span class="p">(</span><span class="kt">double</span> <span class="o">*</span><span class="p">)(</span><span class="n">A</span> <span class="o">+</span> <span class="p">(</span><span class="n">iy_end</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">nx</span> <span class="o">+</span> <span class="n">ix_start</span><span class="p">),</span> <span class="p">(</span><span class="n">ix_end</span> <span class="o">-</span> <span class="n">ix_start</span><span class="p">),</span> <span class="n">bottom</span><span class="p">,</span>
+        <span class="p">(</span><span class="n">cudaStream_t</span><span class="p">)</span><span class="n">acc_get_cuda_stream</span><span class="p">(</span><span class="n">acc_get_default_async</span><span class="p">()));</span>
 <span class="p">}</span>
+<span class="n">nvshmemx_barrier_all_on_stream</span><span class="p">((</span><span class="n">cudaStream_t</span><span class="p">)</span><span class="n">acc_get_cuda_stream</span><span class="p">(</span><span class="n">acc_get_default_async</span><span class="p">()));</span>
+</pre></div>
+<p>Finally, when copying out data, make sure to wait for all device computation to finish first:</p>
+<div class="highlight"><pre><span></span><span class="cp">#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait</span>
 </pre></div>
 <h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task2/poisson2d.solution.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task2/poisson2d.solution.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task2/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task2/">Fortran Version</a></li>
+<li><a href="/edit/C/task5/poisson2d.solution.c">C Version</a></li>
 </ul>
 
 </div>
@@ -14566,17 +15194,18 @@ If you purely work in a terminal you can view the same output by running <code>p
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  <strong>After</strong> the profiling finished the output file <code>poisson2d.solution.pgprof</code>  can be downloaded from here: <a href="/tree/C/task2/pgprof.poisson2d.Task2.solution.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz?download=1">Fortran Version</a>.</p>
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task5.solution.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
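+<p>If you prefer the command line, a single rank's profile can also be inspected locally in text form. This is only a minimal sketch, assuming a local pgprof (or nvprof) installation and that pgprof accepts nvprof's <code>--import-profile</code> option:</p>
+<div class="highlight"><pre><span></span># print a summary of rank 0's profile on your local machine (hypothetical invocation)
+pgprof --import-profile poisson2d.solution.Task5.NP2.0.pgprof
+</pre></div>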
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[31]:</div>
+<div class="prompt input_prompt">In&nbsp;[27]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task2
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task5
 </pre></div>
 
     </div>
@@ -14593,7 +15222,7 @@ If you purely work in a terminal you can view the same output by running <code>p
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task2p
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task5
 </pre>
 </div>
 </div>
@@ -14604,10 +15233,10 @@ If you purely work in a terminal you can view the same output by running <code>p
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[32]:</div>
+<div class="prompt input_prompt">In&nbsp;[28]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
 <span class="o">!</span>make poisson2d.solution
 </pre></div>
 
@@ -14625,7 +15254,50 @@ If you purely work in a terminal you can view the same output by running <code>p
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>make: `poisson2d.solution&#39; is up to date.
+<pre>mpicxx -c -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o
+poisson2d_serial(int, int, double, double *, double *, int, int, const double *):
+     37, Generating present(Anew[:],rhs[:],Aref[:])
+     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])
+     40, Generating Tesla code
+         43, #pragma acc loop gang /* blockIdx.x */
+         44, #pragma acc loop vector(128) /* threadIdx.x */
+         49, Generating implicit reduction(max:error)
+     44, Loop is parallelizable
+     51, Generating Tesla code
+         54, #pragma acc loop gang /* blockIdx.x */
+         55, #pragma acc loop vector(128) /* threadIdx.x */
+     55, Loop is parallelizable
+     58, Generating Tesla code
+         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     65, Generating Tesla code
+         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     77, Generating update self(Aref[:ny*nx])
+mpicxx -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned -I/ccsopen/home/mathiasw/nvshmem-master/build/include poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution -L/ccsopen/home/mathiasw/nvshmem-master/build/lib -lnvshmem -Mcuda -lcuda -lrt 
+poisson2d.solution.c:
+main:
+     90, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])
+    101, Generating present(Aref[:],A[:])
+         Generating Tesla code
+        105, #pragma acc loop gang /* blockIdx.x */
+        106, #pragma acc loop vector(128) /* threadIdx.x */
+    106, Loop is parallelizable
+    137, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])
+    138, Generating present(A[:],rhs[:],Anew[:])
+         Generating Tesla code
+        141, #pragma acc loop gang /* blockIdx.x */
+        142, #pragma acc loop vector(128) /* threadIdx.x */
+        146, Generating implicit reduction(max:error)
+    142, Loop is parallelizable
+    152, Generating present(Anew[:],A[:])
+         Generating Tesla code
+        155, #pragma acc loop gang /* blockIdx.x */
+        156, #pragma acc loop vector(128) /* threadIdx.x */
+    156, Loop is parallelizable
+    190, Generating present(A[:])
+         Generating Tesla code
+        193, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    203, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])
+    221, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])
 </pre>
 </div>
 </div>
@@ -14636,10 +15308,10 @@ If you purely work in a terminal you can view the same output by running <code>p
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[33]:</div>
+<div class="prompt input_prompt">In&nbsp;[29]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution
 </pre></div>
 
@@ -14657,10 +15329,11 @@ If you purely work in a terminal you can view the same output by running <code>p
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
-Job &lt;4703&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
+Job &lt;25213&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
+WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation 
 Jacobi relaxation Calculation: 4096 x 4096 mesh
 Calculate reference solution and time serial execution.
     0, 0.250000
@@ -14685,8 +15358,7 @@ Parallel execution.
   800, 0.249524
   900, 0.249464
 Num GPUs: 2.
-4096x4096: 1 GPU:   1.3294 s, 2 GPUs:   0.7305 s, speedup:     1.82, efficiency:    91.00%
-MPI time:   0.0558 s, inter GPU BW:     2.19 GiB/s
+4096x4096: 1 GPU:   1.3176 s, 2 GPUs:   0.6777 s, speedup:     1.94, efficiency:    97.22%
 </pre>
 </div>
 </div>
@@ -14697,10 +15369,10 @@ MPI time:   0.0558 s, inter GPU BW:     2.19 GiB/s
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[34]:</div>
+<div class="prompt input_prompt">In&nbsp;[30]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile.solution
 </pre></div>
 
@@ -14718,26 +15390,26 @@ MPI time:   0.0558 s, inter GPU BW:     2.19 GiB/s
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --annotate-mpi openmpi -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
-Job &lt;4704&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+Job &lt;25214&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
-==33912== PGPROF is profiling process 33912, command: ./poisson2d.solution 10
-==33913== PGPROF is profiling process 33913, command: ./poisson2d.solution 10
-==33912== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.1.pgprof
-==33913== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.0.pgprof
+==94705== PGPROF is profiling process 94705, command: ./poisson2d.solution 10
+==94707== PGPROF is profiling process 94707, command: ./poisson2d.solution 10
+==94707== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.1.pgprof
+WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation 
 Jacobi relaxation Calculation: 4096 x 4096 mesh
 Calculate reference solution and time serial execution.
     0, 0.250000
 Parallel execution.
     0, 0.250000
 Num GPUs: 2.
-4096x4096: 1 GPU:   0.0233 s, 2 GPUs:   0.0142 s, speedup:     1.64, efficiency:    82.17%
-MPI time:   0.0008 s, inter GPU BW:     1.62 GiB/s
-mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.?.pgprof  .
-tar -cvzf pgprof.poisson2d.Task2.solution.tar.gz poisson2d.solution.Task2.NP2.?.pgprof
-poisson2d.solution.Task2.NP2.0.pgprof
-poisson2d.solution.Task2.NP2.1.pgprof
+4096x4096: 1 GPU:   0.0225 s, 2 GPUs:   0.0117 s, speedup:     1.92, efficiency:    96.05%
+==94705== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.0.pgprof
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.?.pgprof  .
+tar -cvzf pgprof.poisson2d.Task5.solution.tar.gz poisson2d.solution.Task5.NP2.?.pgprof
+poisson2d.solution.Task5.NP2.0.pgprof
+poisson2d.solution.Task5.NP2.1.pgprof
 </pre>
 </div>
 </div>
@@ -14756,20 +15428,20 @@ poisson2d.solution.Task2.NP2.1.pgprof
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[35]:</div>
+<div class="prompt input_prompt">In&nbsp;[31]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task2&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task5&#39;</span><span class="p">)</span>
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run.solution <span class="p">|</span> grep speedup &gt; scale.out
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt; scale.out
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
-<span class="n">data_frameS2</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+<span class="n">data_frameS5</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
 
 <span class="o">!</span>rm scale.out
 
-<span class="n">data_frameS2b</span><span class="o">=</span><span class="n">data_frameS2</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
-<span class="n">data_frameS2b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+<span class="n">data_frameS5b</span><span class="o">=</span><span class="n">data_frameS5</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frameS5b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
 </pre></div>
 
     </div>
@@ -14800,7 +15472,7 @@ poisson2d.solution.Task2.NP2.1.pgprof
 
 <div class="output_area">
 
-    <div class="prompt output_prompt">Out[35]:</div>
+    <div class="prompt output_prompt">Out[31]:</div>
 
 
 
@@ -14833,30 +15505,30 @@ poisson2d.solution.Task2.NP2.1.pgprof
     <tr>
       <th>0</th>
       <td>1</td>
-      <td>1.4007</td>
-      <td>0.94,</td>
-      <td>94.02%</td>
+      <td>1.2915</td>
+      <td>1.02,</td>
+      <td>101.63%</td>
     </tr>
     <tr>
       <th>1</th>
       <td>2</td>
-      <td>0.7482</td>
-      <td>1.77,</td>
-      <td>88.38%</td>
+      <td>0.6742</td>
+      <td>1.96,</td>
+      <td>98.08%</td>
     </tr>
     <tr>
       <th>2</th>
       <td>4</td>
-      <td>0.4536</td>
-      <td>2.94,</td>
-      <td>73.56%</td>
+      <td>0.3801</td>
+      <td>3.47,</td>
+      <td>86.66%</td>
     </tr>
     <tr>
       <th>3</th>
       <td>6</td>
-      <td>0.3480</td>
-      <td>3.78,</td>
-      <td>62.95%</td>
+      <td>0.2733</td>
+      <td>4.80,</td>
+      <td>80.04%</td>
     </tr>
   </tbody>
 </table>
@@ -14872,6 +15544,8 @@ poisson2d.solution.Task2.NP2.1.pgprof
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
+<p>The asynchronous execution, with all operations placed in the same CUDA stream, can be seen in the profiler, e.g. as shown below.</p>
+<p><img src="./resources/Solution5.png" alt="Solution5.png"></p>
 <p><a href="#top">Back to Top</a></p>
 <hr>
 
@@ -14881,51 +15555,8 @@ poisson2d.solution.Task2.NP2.1.pgprof
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h2 id="Solution-3:">Solution 3:<a name="solution3" /><a class="anchor-link" href="#Solution-3:">&#182;</a></h2><p>Update the boundaries first.</p>
-<div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop present(A,Anew)</span>
-<span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span> <span class="p">)</span>
-<span class="p">{</span>
-    <span class="n">A</span><span class="p">[(</span><span class="n">iy_start</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="n">Anew</span><span class="p">[(</span><span class="n">iy_start</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">];</span>
-    <span class="n">A</span><span class="p">[(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="n">Anew</span><span class="p">[(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">];</span>
-<span class="p">}</span>
-</pre></div>
-<p>Start the interior loop asynchronously so it can overlap with the MPI communication and wait at the end for the completion.</p>
-<div class="highlight"><pre><span></span><span class="cp">#pragma acc parallel loop present(A,Anew) async</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">int</span> <span class="n">iy</span> <span class="o">=</span> <span class="n">iy_start</span><span class="o">+</span><span class="mi">1</span><span class="p">;</span> <span class="n">iy</span> <span class="o">&lt;</span> <span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span> <span class="n">iy</span><span class="o">++</span><span class="p">)</span>
-<span class="p">{</span>
-    <span class="k">for</span><span class="p">(</span> <span class="kt">int</span> <span class="n">ix</span> <span class="o">=</span> <span class="n">ix_start</span><span class="p">;</span> <span class="n">ix</span> <span class="o">&lt;</span> <span class="n">ix_end</span><span class="p">;</span> <span class="n">ix</span><span class="o">++</span> <span class="p">)</span>
-    <span class="p">{</span>
-        <span class="n">A</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">]</span> <span class="o">=</span> <span class="n">Anew</span><span class="p">[</span><span class="n">iy</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix</span><span class="p">];</span>
-    <span class="p">}</span>
-<span class="p">}</span>
-
-<span class="c1">//Periodic boundary conditions</span>
-<span class="kt">int</span> <span class="n">top</span>    <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="o">?</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">:</span> <span class="n">rank</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span>
-<span class="kt">int</span> <span class="n">bottom</span> <span class="o">=</span> <span class="p">(</span><span class="n">rank</span> <span class="o">==</span> <span class="p">(</span><span class="n">size</span><span class="o">-</span><span class="mi">1</span><span class="p">))</span> <span class="o">?</span> <span class="mi">0</span> <span class="o">:</span> <span class="n">rank</span><span class="o">+</span><span class="mi">1</span><span class="p">;</span>
-<span class="cp">#pragma acc host_data use_device( A )</span>
-<span class="p">{</span>
-    <span class="kt">double</span> <span class="n">start_mpi</span> <span class="o">=</span> <span class="n">MPI_Wtime</span><span class="p">();</span>
-    <span class="c1">//1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom</span>
-    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="n">iy_start</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">A</span><span class="o">+</span><span class="n">iy_end</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
-
-    <span class="c1">//2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top</span>
-    <span class="n">MPI_Sendrecv</span><span class="p">(</span> <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_end</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span>   <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">bottom</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">A</span><span class="o">+</span><span class="p">(</span><span class="n">iy_start</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">nx</span><span class="o">+</span><span class="n">ix_start</span><span class="p">,</span> <span class="p">(</span><span class="n">ix_end</span><span class="o">-</span><span class="n">ix_start</span><span class="p">),</span> <span class="n">MPI_REAL_TYPE</span><span class="p">,</span> <span class="n">top</span>   <span class="p">,</span> <span class="mi">0</span><span class="p">,</span>
-                  <span class="n">MPI_COMM_WORLD</span><span class="p">,</span> <span class="n">MPI_STATUS_IGNORE</span> <span class="p">);</span>
-    <span class="n">mpi_time</span> <span class="o">+=</span> <span class="n">MPI_Wtime</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_mpi</span><span class="p">;</span>
-<span class="p">}</span>
-<span class="cp">#pragma acc wait</span>
-</pre></div>
-<h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
-<li><a href="/edit/C/task3/poisson2d.solution.c">C Version</a></li>
-<li><a href="/edit/FORTRAN/task3/poisson2d.solution.F03">Fortran Version</a></li>
-</ul>
-<h4 id="File-browser">File browser<a class="anchor-link" href="#File-browser">&#182;</a></h4><p>Can be used to open source files, Makefiles, profiling output.</p>
-<ul>
-<li><a href="/tree/C/task3/">C Version</a></li>
-<li><a href="/tree/FORTRAN/task3/">Fortran Version</a></li>
+<h2 id="Solution-6:-TODO">Solution 6:<a name="solution6" /> TODO<a class="anchor-link" href="#Solution-6:-TODO">&#182;</a></h2><h4 id="Code">Code<a class="anchor-link" href="#Code">&#182;</a></h4><ul>
+<li><a href="./C/task6/poisson2d.solution.c">C Version</a></li>
 </ul>
 
 </div>
@@ -14934,17 +15565,18 @@ poisson2d.solution.Task2.NP2.1.pgprof
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  <strong>After</strong> the profiling finished the output file <code>poisson2d.solution.pgprof</code>  can be downloaded from here: <a href="/tree/C/task3/pgprof.poisson2d.Task3.solution.tar.gz?download=1">C Version</a> / <a href="/tree/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz?download=1">Fortran Version</a>.</p>
+<h4 id="Compiling,-Running-and-Profiling">Compiling, Running and Profiling<a class="anchor-link" href="#Compiling,-Running-and-Profiling">&#182;</a></h4><p>You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. <strong>After</strong> the profiling completed download the tarball containing the profiles (<code>pgprof.Task6.solution.poisson2d.tar.gz</code>) with the File Browser. 
+Then you can import them into pgprof / nvvp using the <em>Import</em> option in the <em>File</em> menu. Remember to use the <em>Multiple processes</em> option in the assistant.</p>
 
 </div>
 </div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[36]:</div>
+<div class="prompt input_prompt">In&nbsp;[32]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task3
+<div class=" highlight hl-ipython3"><pre><span></span><span class="o">%</span><span class="k">cd</span> $basedir/task6
 </pre></div>
 
     </div>
@@ -14961,7 +15593,7 @@ poisson2d.solution.Task2.NP2.1.pgprof
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task3p
+<pre>/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task6
 </pre>
 </div>
 </div>
@@ -14972,10 +15604,10 @@ poisson2d.solution.Task2.NP2.1.pgprof
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[37]:</div>
+<div class="prompt input_prompt">In&nbsp;[33]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
 <span class="o">!</span>make poisson2d.solution
 </pre></div>
 
@@ -14993,7 +15625,50 @@ poisson2d.solution.Task2.NP2.1.pgprof
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>make: `poisson2d.solution&#39; is up to date.
+<pre>mpicxx -c -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o
+poisson2d_serial(int, int, double, double *, double *, int, int, const double *):
+     37, Generating present(Anew[:],rhs[:],Aref[:])
+     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])
+     40, Generating Tesla code
+         43, #pragma acc loop gang /* blockIdx.x */
+         44, #pragma acc loop vector(128) /* threadIdx.x */
+         49, Generating implicit reduction(max:error)
+     44, Loop is parallelizable
+     51, Generating Tesla code
+         54, #pragma acc loop gang /* blockIdx.x */
+         55, #pragma acc loop vector(128) /* threadIdx.x */
+     55, Loop is parallelizable
+     58, Generating Tesla code
+         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     65, Generating Tesla code
+         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+     77, Generating update self(Aref[:ny*nx])
+mpicxx -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned -I/ccsopen/home/mathiasw/nvshmem-master/build/include poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution -L/ccsopen/home/mathiasw/nvshmem-master/build/lib -lnvshmem -Mcuda -lcuda -lrt 
+poisson2d.solution.c:
+main:
+     95, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])
+    106, Generating present(Aref[:],A[:])
+         Generating Tesla code
+        110, #pragma acc loop gang /* blockIdx.x */
+        111, #pragma acc loop vector(128) /* threadIdx.x */
+    111, Loop is parallelizable
+    159, Generating update device(rhs[nx*iy_start:nx*(iy_end-iy_start)],A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])
+    160, Generating present(A[:],rhs[:],Anew[:])
+         Generating Tesla code
+        165, #pragma acc loop gang /* blockIdx.x */
+        166, #pragma acc loop vector(128) /* threadIdx.x */
+        170, Generating implicit reduction(max:error)
+    166, Loop is parallelizable
+    176, Generating present(Anew[:],A[:])
+         Generating Tesla code
+        179, #pragma acc loop gang /* blockIdx.x */
+        181, #pragma acc loop vector(128) /* threadIdx.x */
+    181, Loop is parallelizable
+    192, Generating present(A[:])
+         Generating Tesla code
+        195, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
+    205, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])
+    224, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])
 </pre>
 </div>
 </div>
@@ -15004,10 +15679,10 @@ poisson2d.solution.Task2.NP2.1.pgprof
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[38]:</div>
+<div class="prompt input_prompt">In&nbsp;[34]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution
 </pre></div>
 
@@ -15025,10 +15700,11 @@ poisson2d.solution.Task2.NP2.1.pgprof
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
-Job &lt;4709&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; ./poisson2d.solution
+Job &lt;25219&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
+WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation 
 Jacobi relaxation Calculation: 4096 x 4096 mesh
 Calculate reference solution and time serial execution.
     0, 0.250000
@@ -15053,8 +15729,8 @@ Parallel execution.
   800, 0.249524
   900, 0.249464
 Num GPUs: 2.
-4096x4096: 1 GPU:   1.3417 s, 2 GPUs:   0.7025 s, speedup:     1.91, efficiency:    95.50%
-MPI time:   0.0658 s, inter GPU BW:     1.86 GiB/s
+4096x4096: 1 GPU:   1.3157 s, 2 GPUs:   0.6533 s, speedup:     2.01, efficiency:   100.70%
+MPI time:   0.0000 s, inter GPU BW:      inf GiB/s
 </pre>
 </div>
 </div>
@@ -15065,10 +15741,10 @@ MPI time:   0.0658 s, inter GPU BW:     1.86 GiB/s
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[39]:</div>
+<div class="prompt input_prompt">In&nbsp;[35]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make profile.solution
 </pre></div>
 
@@ -15086,26 +15762,27 @@ MPI time:   0.0658 s, inter GPU BW:     1.86 GiB/s
 
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --annotate-mpi openmpi -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
-Job &lt;4710&gt; is submitted to default queue &lt;batch&gt;.
+<pre>bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs &#34;-gpu&#34; pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10
+Job &lt;25220&gt; is submitted to default queue &lt;batch&gt;.
 &lt;&lt;Waiting for dispatch ...&gt;&gt;
 &lt;&lt;Starting on login1&gt;&gt;
-==34328== PGPROF is profiling process 34328, command: ./poisson2d.solution 10
-==34327== PGPROF is profiling process 34327, command: ./poisson2d.solution 10
-==34328== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.1.pgprof
+==95445== PGPROF is profiling process 95445, command: ./poisson2d.solution 10
+==95446== PGPROF is profiling process 95446, command: ./poisson2d.solution 10
+==95445== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.1.pgprof
+WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation 
 Jacobi relaxation Calculation: 4096 x 4096 mesh
 Calculate reference solution and time serial execution.
     0, 0.250000
 Parallel execution.
     0, 0.250000
 Num GPUs: 2.
-4096x4096: 1 GPU:   0.0234 s, 2 GPUs:   0.0135 s, speedup:     1.74, efficiency:    86.82%
-MPI time:   0.0009 s, inter GPU BW:     1.29 GiB/s
-==34327== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.0.pgprof
-mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.?.pgprof  .
-tar -cvzf pgprof.poisson2d.Task3.solution.tar.gz poisson2d.solution.Task3.NP2.?.pgprof
-poisson2d.solution.Task3.NP2.0.pgprof
-poisson2d.solution.Task3.NP2.1.pgprof
+4096x4096: 1 GPU:   0.0225 s, 2 GPUs:   0.0116 s, speedup:     1.94, efficiency:    96.85%
+MPI time:   0.0000 s, inter GPU BW:      inf GiB/s
+==95446== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.0.pgprof
+mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.?.pgprof  .
+tar -cvzf pgprof.poisson2d.Task6.solution.tar.gz poisson2d.solution.Task6.NP2.?.pgprof
+poisson2d.solution.Task6.NP2.0.pgprof
+poisson2d.solution.Task6.NP2.1.pgprof
 </pre>
 </div>
 </div>
@@ -15124,20 +15801,20 @@ poisson2d.solution.Task3.NP2.1.pgprof
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
-<div class="prompt input_prompt">In&nbsp;[40]:</div>
+<div class="prompt input_prompt">In&nbsp;[36]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task3&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">checkdir</span><span class="p">(</span><span class="s1">&#39;task6&#39;</span><span class="p">)</span>
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">1</span> make run.solution <span class="p">|</span> grep speedup &gt; scale.out
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">2</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt; scale.out
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">4</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
 <span class="o">!</span><span class="nv">NP</span><span class="o">=</span><span class="m">6</span> make run.solution <span class="p">|</span> grep speedup &gt;&gt;  scale.out
-<span class="n">data_frameS3</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+<span class="n">data_frameS5</span> <span class="o">=</span> <span class="n">pandas</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;scale.out&#39;</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
 
 <span class="o">!</span>rm scale.out
 
-<span class="n">data_frameS3b</span><span class="o">=</span><span class="n">data_frameS3</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
-<span class="n">data_frameS3b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
+<span class="n">data_frameS5b</span><span class="o">=</span><span class="n">data_frameS5</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,[</span><span class="mi">5</span><span class="p">,</span><span class="mi">7</span><span class="p">,</span><span class="mi">10</span><span class="p">,</span><span class="mi">12</span><span class="p">]]</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+<span class="n">data_frameS5b</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="mi">5</span><span class="p">:</span><span class="s1">&#39;GPUs&#39;</span><span class="p">,</span> <span class="mi">7</span><span class="p">:</span> <span class="s1">&#39;time [s]&#39;</span><span class="p">,</span> <span class="mi">10</span><span class="p">:</span><span class="s1">&#39;speedup&#39;</span><span class="p">,</span> <span class="mi">12</span><span class="p">:</span><span class="s1">&#39;efficiency&#39;</span><span class="p">})</span>
 </pre></div>
 
     </div>
@@ -15168,7 +15845,7 @@ poisson2d.solution.Task3.NP2.1.pgprof
 
 <div class="output_area">
 
-    <div class="prompt output_prompt">Out[40]:</div>
+    <div class="prompt output_prompt">Out[36]:</div>
 
 
 
@@ -15201,30 +15878,30 @@ poisson2d.solution.Task3.NP2.1.pgprof
     <tr>
       <th>0</th>
       <td>1</td>
-      <td>1.3711</td>
-      <td>0.96,</td>
-      <td>96.37%</td>
+      <td>1.2869</td>
+      <td>1.02,</td>
+      <td>102.05%</td>
     </tr>
     <tr>
       <th>1</th>
       <td>2</td>
-      <td>0.7171</td>
-      <td>1.86,</td>
-      <td>92.90%</td>
+      <td>0.6574</td>
+      <td>1.99,</td>
+      <td>99.26%</td>
     </tr>
     <tr>
       <th>2</th>
       <td>4</td>
-      <td>0.4104</td>
-      <td>3.21,</td>
-      <td>80.16%</td>
+      <td>0.3670</td>
+      <td>3.59,</td>
+      <td>89.71%</td>
     </tr>
     <tr>
       <th>3</th>
       <td>6</td>
-      <td>0.2966</td>
-      <td>4.47,</td>
-      <td>74.47%</td>
+      <td>0.2450</td>
+      <td>5.37,</td>
+      <td>89.42%</td>
     </tr>
   </tbody>
 </table>
@@ -15240,16 +15917,9 @@ poisson2d.solution.Task3.NP2.1.pgprof
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>The overlap of compute and communication can be seen in the profiler, e.g. as shown below.</p>
-<p><img src="Solution3.png" alt="Solution3.png"></p>
-
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<hr>
+<p>The absence of device copies can be seen in the profiler, e.g. as shown below.</p>
+<p><img src="./resources/Solution6.png" alt="Solution6.png"></p>
+<p><a href="#top">Back to Top</a></p>
 <hr>
 
 </div>
@@ -15258,24 +15928,12 @@ poisson2d.solution.Task3.NP2.1.pgprof
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<h1 id="Survey">Survey<a name="survey" /><a class="anchor-link" href="#Survey">&#182;</a></h1><p>Please rememeber to take some time and fill out the survey <a href="http://bit.ly/sc18-eval">http://bit.ly/sc18-eval</a>.
-<img src="eval.png" alt="eval.png"></p>
-
-</div>
-</div>
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[&nbsp;]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span> 
-</pre></div>
+<hr>
+<h1 id="Survey">Survey<a name="survey" /><a class="anchor-link" href="#Survey">&#182;</a></h1><p>Please remember to take some time and fill out the survey<a href="http://bit.ly/sc19-eval">http://bit.ly/sc19-eval</a>.</p>
+<p><img src="./resources/eval.png" alt="eval.png"></p>
 
-    </div>
 </div>
 </div>
-
 </div>
     </div>
   </div>
diff --git a/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.ipynb b/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.ipynb
index 664f541d9f948f99a5d8ee4c357e962516d4f857..b9e8b1a138a072731a8fee8b8acd7c8c85e4dc79 100644
--- a/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.ipynb
+++ b/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.ipynb
@@ -4,57 +4,57 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Solutions: Hands-On GPU Programming\n",
-    "_Supercomputing 2018 Tutorial \"Application Porting and Optimization on GPU-Accelerated POWER Architectures\", November 12th 2018_\n",
+    "# Hands-On GPU Programming\n",
+    "_Supercomputing 2019 Tutorial \"Application Porting and Optimization on GPU-Accelerated POWER Architectures\", November 18th 2019_\n",
     "\n",
     "---"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "### Read me first\n",
-    "\n",
-    "**This contains the output for the solutions. It is for illustrative purpose only and not suitable for execution.**   The solutions are described in the solution section. If you run this as a jupyter notebook from the `Solutions` directory links to the solution source files and solution profiles should work. For the _html_ and _pdf_ versions please navigate to the corresponding directory to find the solution profiles and sources.\n",
-    "\n",
-    "Skip ahead to the [Solutions](#solutions)\n",
-    "\n",
-    "\n",
-    "---\n",
-    "---\n",
-    "\n",
-    "This tutorial is primarily designed to be executed as a _jupyter_ notebook. However, everything can also be done using an _ssh_ connection to _ascent.olcf.ornl.gov_ in your terminal.\n",
+    "## Solutions \n",
     "\n",
-    "#### Jupyter notebook execution\n",
+    "**This contains the output for the solutions.**\n",
     "\n",
-    "When using jupyter this notebook will guide you through the step. Note that if you execute a cell multiple times while optimizing the code the output will be replaced. You can however duplicate the cell you want to execute and keep its output. Check the _edit_ menu above.\n",
+    "The solutions are described in the solution section. The directory links to the solution source files should work though. For the _html_ and _pdf_ versions please navigate to the corresponding directory to find the solution profiles and sources.\n",
     "\n",
-    "You will always find links to a file browser of the corresponding task subdirectory as well as direct links to the source files you will need to edit as well as the profiling output you need to open locally.\n",
     "\n",
-    "If you want you also can get a [terminal](/terminals/4) in your browser.\n",
+    "### GPU Programming\n",
     "\n",
-    "#### Terminal fallback\n",
-    "The tasks are placed in directories named `[C/FORTRAN]/task[0-3]`.\n",
-    "\n",
-    "The files you will need to edit are always the `poisson2d.(C|F03)` files.\n",
+    "* [Solution 0](#solution0) Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using `–ta=tesla:managed`  \n",
+    "  \n",
     "\n",
-    "The makefile targets execute everything to compile, run and profile the code. Please take a look at the cells containing the make calls as a guide.\n",
+    "* [Solution 1](#solution1) Fix memory access pattern of OpenACC accelerated Jacobi Solver  \n",
+    "  \n",
     "\n",
-    "The outputs of profiling runs be placed in the working directory of the current task and are named like `*.pgprof` or `pgprof.*.tar.gz` in case of multiple files. You can use _scp/sftp_ to transfer files to your machine and for viewing them in pgprof/nvprof.\n",
+    "### Multi-GPU with MPI\n",
     "\n",
-    "#### Viewing profiles in the NVIDIA Visual Profiler / PGI Profiler\n",
+    "* [Solution 2](#solution2) Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs  \n",
+    "  \n",
     "\n",
-    "The profiles generated _pgprof / nvprof_ should be viewed on your local machine. You can install the PGI Community Edition (pgprof) or the NVIDIA CUDA Toolkit on your notebook (Windows, Mac, Linux). You don't need an NVIDIA GPU in your machine to use the profiler GUI.\n",
+    "* [Solution 3](#solution3) Hide MPI communication time by overlapping communication and \n",
+    "\tcomputation in a MPI+OpenACC multi GPU Jacobi Solver  \n",
+    "  \n",
+    "  \n",
+    "  \n",
+    "### Multi-GPU with NVSHMEM *(Advanced -- C only)*\n",
+    "  \n",
+    "* [Solution 4](#solution4) Use NVSHMEM instead of MPI  \n",
+    "  \n",
     "\n",
-    "There are USB Sticks in the room that contain the installers for various platforms, but for reference you can also download it from:\n",
-    "* [NVIDIA CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)\n",
-    "* [PGI Community Edition](https://www.pgroup.com/products/community.htm)\n",
+    "* [Solution 5](#solution5) Put NVSHMEM calls on stream to hide API calls and GPU/CPU synchronization  \n",
+    "  \n",
     "\n",
-    "After downloading the profiler output (more infos below) follow the steps outlined in:\n",
-    "* [Import Session](https://docs.nvidia.com/cuda/profiler-users-guide/index.html#import-session)\n",
+    "### Survey\n",
+    " \n",
+    " * [Suvery](#survey) Please remember to take the survey !\n",
     "\n",
-    "In case there is confusion: The PGI Profiler is a slightly modified version (different default settings) of the NVIDIA Visual Profiler. So you can use any of the two to view profiles. "
+    "---\n",
+    "---"
    ]
   },
   {
@@ -69,12 +69,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T16:08:18.286482Z",
-     "start_time": "2018-11-06T16:08:18.275563Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -98,6 +93,7 @@
     "if(not rootdir):\n",
     "    rootdir=%pwd\n",
     "basedir=os.path.join(rootdir,LANGUAGE)\n",
+    "basedirC=os.path.join(rootdir,'C')\n",
     "\n",
     "print (\"You selected {} for the exercises.\".format(LANGUAGE))\n",
     "\n",
@@ -110,180 +106,231 @@
     "    for t in range(4):\n",
     "        d='%s/task%i'%(basedir,t)\n",
     "        %cd $d\n",
-    "        !make clean"
+    "        !make clean\n",
+    "        \n",
+    "#cleanall()"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task0\n"
+     ]
+    }
+   ],
    "source": [
-    "# Tasks<a name=\"top\"></a>\n",
-    "\n",
-    "This session comes with multiple tasks. All tasks are available in C or FORTRAN and can be found in the `[C|Fortan]/task[0-3]` subdirectories. There you will also find Makefiles that are set up so that you can compile and submit all necessary tasks.\n",
-    "\n",
-    "Please choose from the task below.\n",
-    "\n",
-    "\n",
-    "* [Task 0](#task0) Accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using `–ta=tesla:managed`  \n",
-    "  [Solution 0](#solution0)\n",
-    "\n",
-    "* [Task 1](#task1) Fix memory access pattern of OpenACC accelerated Jacobi Solver  \n",
-    "  [Solution 1](#solution1)\n",
-    "\n",
-    "* [Task 2](#task2) Use MPI to make OpenACC accelerated Jacobi Solver scale to multiple GPUs  \n",
-    "  [Solution 2](#solution2)\n",
-    "\n",
-    "* [Task 3](#task3) Hide MPI communication time by overlapping communication and \n",
-    "\tcomputation in a MPI+OpenACC multi GPU Jacobi Solver  \n",
-    "  [Solution 3](#solution3)\n",
-    "  \n",
-    "  \n",
-    "* [Suvery](#survey) Please remember to take the survey !\n",
-    "    \n",
-    "### Make Targets <a name=\"make\"></a>\n",
-    "\n",
-    "For all tasks we have defined the following make targets. \n",
-    "\n",
-    "* __run__:  \n",
-    "   run `poisson2d`\n",
-    "* __poisson2d__:  \n",
-    "  build `poisson2d` binary (default)\n",
-    "* __profile__:  \n",
-    "  profile with `pgprof`\n",
-    "* __*.solution__:  \n",
-    "  same as above for the solution (e.g. `make poisson2d.solution` or `make run.solution`)"
+    "%cd $basedir/task0"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "[Back to Top](#top)\n",
-    "\n",
+    "---\n",
     "---"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Task 0: <a name=\"task0\"></a>Using OpenACC\n",
-    "\n",
-    "\n",
-    "### Description\n",
-    "\n",
-    "The goal of this task is to accelerate a CPU Jacobi solver with OpenACC relying on Unified Memory for data movement using `–ta=tesla:managed`.\n",
-    "\n",
-    "Your task is to:\n",
-    "\n",
-    "* Parallelize Loops with OpenACC parallel loop\n",
-    "\n",
-    "_Look for_ __TODOs__ in the code.\n",
-    "\n",
-    "Look at the output generated by the PGI compiler (enabled by the `-Minfo=accel` option) to see how the compiler parallelizes the code.\n",
-    "\n",
-    "\n",
-    "#### Code\n",
-    "\n",
-    "You can open the source code either in a terminal in an editor. Navigate to `(C|Fortran)/task0/` and open `poisson2d.c` in a editor of your choice.\n",
-    "\n",
-    "If your are using the jupyter approach by following the link (for the language of your choice), This will open the source code in an editor in a new browser tab/window.\n",
-    "\n",
-    "* [C Version](/edit/C/task0/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTAN/task0/poisson2d.F03)\n",
+    "# Solutions<a name=\"solutions\"></a>\n",
     "\n",
-    "#### File browser\n",
+    "Below are suggested solutions. This is only a short description of the solution, but the `poisson2d.solution.(c|F03)` files linked below have the full source code. If you want to run / profile the solutions feel free to duplicate the cells for the tasks and change the [make target](#make) to the `*.solution` ones.\n",
     "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task0/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task0/)\n",
+    "[Back to Top](#top)\n",
     "\n",
-    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
+    "---"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T16:08:47.160912Z",
-     "start_time": "2018-11-06T16:08:47.155534Z"
-    }
+    "exercise": "solution"
    },
-   "outputs": [],
    "source": [
-    "%cd $basedir/task0"
+    "## Solution 0:<a name=\"solution0\"></a>\n",
+    "\n",
+    "```C++\n",
+    "#pragma acc parallel loop\n",
+    "for (int ix = ix_start; ix < ix_end; ix++)\n",
+    "{\n",
+    "    #pragma acc loop\n",
+    "    for( int iy = iy_start; iy < iy_end; iy++ )\n",
+    "    {\n",
+    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]\n",
+    "                                               + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));\n",
+    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
+    "    }\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "#### Code\n",
+    "\n",
+    "* [C Version](/C/task0/poisson2d.solution.c?edit=1)\n",
+    "* [Fortran Version](/edit/./FORTRAN/task0/poisson2d.solution.F03)\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "#### Compilation and Execution\n",
+    "#### Compiling, Running and Profiling\n",
     "\n",
-    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
-    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
+    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/./C/task0/poisson2d.solution.pgprof?download=1) / [Fortran Version](./FORTRAN/task0/poisson2d.solution.pgprof?download=1).    "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-06T16:08:51.631731Z",
-     "start_time": "2018-11-06T16:08:51.481285Z"
-    }
+    "exercise": "solution"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task0\n"
+     ]
+    }
+   ],
    "source": [
-    "checkdir('task0')\n",
-    "!make"
+    "%cd $basedir/task0"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 4,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pgcc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed poisson2d_serial.c -o poisson2d_serial.o\n",
+      "pgcc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution\n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     66, Generating Tesla code\n",
+      "         67, #pragma acc loop gang /* blockIdx.x */\n",
+      "         68, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     66, Generating implicit copyout(A[:])\n",
+      "     68, Loop is parallelizable\n",
+      "     88, Generating Tesla code\n",
+      "         89, #pragma acc loop gang /* blockIdx.x */\n",
+      "         90, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         94, Generating implicit reduction(max:error)\n",
+      "     88, Generating implicit copyin(A[:],rhs[:])\n",
+      "         Generating implicit copyout(Anew[:])\n",
+      "     90, Loop is parallelizable\n",
+      "     98, Generating Tesla code\n",
+      "         99, #pragma acc loop gang /* blockIdx.x */\n",
+      "        100, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     98, Generating implicit copyin(Anew[:])\n",
+      "         Generating implicit copyout(A[:])\n",
+      "    100, Loop is parallelizable\n",
+      "    106, Generating Tesla code\n",
+      "        107, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    106, Generating implicit copyin(A[:])\n",
+      "         Generating implicit copyout(A[nx*(ny-1)+1:2046])\n",
+      "    111, Generating Tesla code\n",
+      "        112, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    111, Generating implicit copy(A[:])\n"
+     ]
+    }
+   ],
    "source": [
     "checkdir('task0')\n",
-    "!make run"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Profiling\n",
-    "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output file `poisson2d.pgprof`  can be downloaded from here: [C Version](/tree/C/task0/poisson2d.pgprof?download=1) / [Fortran Version](/tree/FORTRAN/task0/poisson2d.pgprof?download=1).\n",
-    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu.     \n",
-    "    "
+    "!make poisson2d.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 5,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution\n",
+      "Job <25189> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
+      "Calculate reference solution and time serial CPU execution.\n",
+      "    0, 0.249999\n",
+      "  100, 0.249760\n",
+      "  200, 0.249522\n",
+      "  300, 0.249285\n",
+      "  400, 0.249048\n",
+      "GPU execution.\n",
+      "    0, 0.249999\n",
+      "  100, 0.249760\n",
+      "  200, 0.249522\n",
+      "  300, 0.249285\n",
+      "  400, 0.249048\n",
+      "2048x2048: 1 CPU:   5.4684 s, 1 GPU:   0.1884 s, speedup:    29.02\n"
+     ]
+    }
+   ],
    "source": [
     "checkdir('task0')\n",
-    "!make profile"
+    "!make run.solution"
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.pgprof ./poisson2d.solution 10\n",
+      "Job <25190> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "==91820== PGPROF is profiling process 91820, command: ./poisson2d.solution 10\n",
+      "==91820== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.pgprof\n",
+      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
+      "Calculate reference solution and time serial CPU execution.\n",
+      "    0, 0.249999\n",
+      "GPU execution.\n",
+      "    0, 0.249999\n",
+      "2048x2048: 1 CPU:   0.1230 s, 1 GPU:   0.0189 s, speedup:     6.51\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.pgprof .\n"
+     ]
+    }
+   ],
    "source": [
-    "#### References\n",
-    "\n",
-    "1. http://www.openacc.org\n",
-    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)"
+    "checkdir('task0')\n",
+    "!make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "[Back to Top](#top)\n",
     "\n",
@@ -292,376 +339,698 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Task 1:<a name=\"task1\"></a> Memory Access Patterns\n",
-    "\n",
-    "\n",
-    "### Description\n",
-    "The goal of this task is to fix the memory access pattern of OpenACC accelerated Jacobi Solver. Generate the profile, download the generated profiles and import them into pgprof / nvprof.\n",
-    "There use “Global Memory Access Pattern” experiment to analyze the issue.\n",
+    "## Solution 1:<a name=\"solution1\"></a>\n",
     "\n",
-    "_Look for_ __TODOs__ in the code.\n",
+    "Swap the `ix` and `iy` loops to make sure that `ix` is the fastest running index \n",
     "\n",
+    "```C\n",
+    "#pragma acc parallel loop\n",
+    "for (int iy = iy_start; iy < iy_end; iy++)\n",
+    "{\n",
+    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
+    "    {\n",
+    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]\n",
+    "                                               + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));\n",
+    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
+    "    }\n",
+    "}\n",
+    "```\n",
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task1/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task1/poisson2d.F03)\n",
-    "\n",
-    "#### Directory browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task1/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task1/)\n",
-    "\n",
-    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%cd $basedir/task1"
+    "* [C Version](/edit/C/task1/poisson2d.solution.c)\n",
+    "* [Fortran Version](/edit/FORTRAN/task1/poisson2d.solution.F03)"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "#### Compilation and Execution\n",
+    "#### Compiling, Running and Profiling\n",
     "\n",
-    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
-    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
+    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1).  "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 7,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task1\n"
+     ]
+    }
+   ],
    "source": [
-    "checkdir('task1')\n",
-    "!make"
+    "%cd $basedir/task1"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 8,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pgcc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo poisson2d_serial.c -o poisson2d_serial.o\n",
+      "pgcc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,managed,lineinfo poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution\n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     66, Generating Tesla code\n",
+      "         67, #pragma acc loop gang /* blockIdx.x */\n",
+      "         68, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     66, Generating implicit copyout(A[:])\n",
+      "     68, Loop is parallelizable\n",
+      "     88, Generating Tesla code\n",
+      "         89, #pragma acc loop gang /* blockIdx.x */\n",
+      "         90, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         94, Generating implicit reduction(max:error)\n",
+      "     88, Generating implicit copyin(A[:],rhs[:])\n",
+      "         Generating implicit copyout(Anew[:])\n",
+      "     90, Loop is parallelizable\n",
+      "     98, Generating Tesla code\n",
+      "         99, #pragma acc loop gang /* blockIdx.x */\n",
+      "        100, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     98, Generating implicit copyin(Anew[:])\n",
+      "         Generating implicit copyout(A[:])\n",
+      "    100, Loop is parallelizable\n",
+      "    106, Generating Tesla code\n",
+      "        107, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    106, Generating implicit copyin(A[:])\n",
+      "         Generating implicit copyout(A[nx*(ny-1)+1:2046])\n",
+      "    111, Generating Tesla code\n",
+      "        112, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    111, Generating implicit copy(A[:])\n"
+     ]
+    }
+   ],
    "source": [
     "checkdir('task1')\n",
-    "!make run"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Profiling\n",
-    "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output files can be downloaded from here: [C Version](/tree/C/task1/pgprof.poisson2d.Task1.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task1/pgprof.Task1.poisson2d.tar.gz?download=1).\n",
-    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu.     \n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!ls"
+    "!make poisson2d.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 9,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution\n",
+      "Job <25191> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
+      "Calculate reference solution and time serial CPU execution.\n",
+      "    0, 0.249999\n",
+      "  100, 0.249760\n",
+      "  200, 0.249522\n",
+      "  300, 0.249285\n",
+      "  400, 0.249048\n",
+      "GPU execution.\n",
+      "    0, 0.249999\n",
+      "  100, 0.249760\n",
+      "  200, 0.249522\n",
+      "  300, 0.249285\n",
+      "  400, 0.249048\n",
+      "2048x2048: 1 CPU:   5.4691 s, 1 GPU:   0.1866 s, speedup:    29.31\n"
+     ]
+    }
+   ],
    "source": [
     "checkdir('task1')\n",
-    "!make profile"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For the _Global Memory Load/Store Efficiency_ the `make profile` command also generated a CSV file that you can import and view with the cell below.  \n",
-    "If you purely work in a terminal you can view the same output by running `pgprof -i poisson2d.efficiency.pgprof`."
+    "!make run.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 10,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.timeline.pgprof ./poisson2d.solution 3\n",
+      "Job <25192> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "==92054== PGPROF is profiling process 92054, command: ./poisson2d.solution 3\n",
+      "==92054== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.timeline.pgprof\n",
+      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
+      "Calculate reference solution and time serial CPU execution.\n",
+      "    0, 0.249999\n",
+      "GPU execution.\n",
+      "    0, 0.249999\n",
+      "2048x2048: 1 CPU:   0.0465 s, 1 GPU:   0.0154 s, speedup:     3.01\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  --analysis-metrics -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.metrics.pgprof ./poisson2d.solution 3\n",
+      "Job <25193> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "==71647== PGPROF is profiling process 71647, command: ./poisson2d.solution 3\n",
+      "==71647== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.\n",
+      "==71647== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.metrics.pgprof\n",
+      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
+      "Calculate reference solution and time serial CPU execution.\n",
+      "    0, 0.249999\n",
+      "GPU execution.\n",
+      "    0, 0.249999\n",
+      "2048x2048: 1 CPU:   0.0476 s, 1 GPU:  12.4561 s, speedup:     0.00\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off --openmp-profiling off  --metrics gld_efficiency,gst_efficiency -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3\n",
+      "Job <25194> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "==92292== PGPROF is profiling process 92292, command: ./poisson2d.solution 3\n",
+      "==92292== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.\n",
+      "==92292== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.efficiency.pgprof\n",
+      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
+      "Calculate reference solution and time serial CPU execution.\n",
+      "    0, 0.249999\n",
+      "GPU execution.\n",
+      "    0, 0.249999\n",
+      "2048x2048: 1 CPU:   0.0487 s, 1 GPU:   0.6897 s, speedup:     0.07\n",
+      "pgprof --csv -i /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.efficiency.pgprof 2>&1 | grep -v \"======\" > poisson2d.solution.efficiency.csv\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.*.pgprof .\n",
+      "tar -cvzf pgprof.poisson2d.Task1.solution.tar.gz  poisson2d.solution.*.pgprof\n",
+      "poisson2d.solution.efficiency.pgprof\n",
+      "poisson2d.solution.metrics.pgprof\n",
+      "poisson2d.solution.timeline.pgprof\n"
+     ]
+    }
+   ],
    "source": [
     "checkdir('task1')\n",
-    "data_frame = pandas.read_csv('poisson2d.efficiency.csv', sep=',')\n",
-    "data_frame"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### References\n",
-    "\n",
-    "1. http://www.openacc.org\n",
-    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
-    "3. [pgprof Quickstart](https://www.pgroup.com/resources/pgprof-quickstart.htm)\n",
-    "4. [CUDA Toolkit Documentation - Profiler](https://docs.nvidia.com/cuda/profiler-users-guide/index.html) _pgprof is based on the NVIDIA Visual Profiler_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "[Back to Top](#top)\n",
-    "\n",
-    "---"
+    "!make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Task 2: <a name=\"task2\"></a>Apply Domain Decomposition\n",
-    "\n",
-    "\n",
-    "### Description\n",
-    "\n",
-    "Your task is to apply a domain decomposition and use MPI for the data exchange. Specifically you should\n",
-    "* Handle GPU affinity\n",
-    "* Do the Halo Exchange\n",
-    "\n",
-    "_Look for_ __TODOs__\n",
-    "\n",
-    "When profiling take a look at how kernel and communication times change when you scale to more GPUs.\n",
-    "\n",
-    "#### Code\n",
-    "\n",
-    "* [C Version](/edit/C/task2/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task2/poisson2d.F03)\n",
-    "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task2/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task2/)\n",
-    "\n",
-    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
+    "For the _Global Memory Load/Store Efficiency_ the `make profile` command also generated a CSV file that you can import and view with the cell below.  \n",
+    "If you purely work in a terminal you can view the same output by running `pgprof -i poisson2d.efficiency.solution.pgprof`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 11,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Device</th>\n",
+       "      <th>Kernel</th>\n",
+       "      <th>Invocations</th>\n",
+       "      <th>Metric Name</th>\n",
+       "      <th>Metric Description</th>\n",
+       "      <th>Min</th>\n",
+       "      <th>Max</th>\n",
+       "      <th>Avg</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_98_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gld_efficiency</td>\n",
+       "      <td>Global Memory Load Efficiency</td>\n",
+       "      <td>90.868353%</td>\n",
+       "      <td>90.896134%</td>\n",
+       "      <td>90.881874%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_98_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gst_efficiency</td>\n",
+       "      <td>Global Memory Store Efficiency</td>\n",
+       "      <td>88.956522%</td>\n",
+       "      <td>88.956522%</td>\n",
+       "      <td>88.956522%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_106_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gld_efficiency</td>\n",
+       "      <td>Global Memory Load Efficiency</td>\n",
+       "      <td>94.722222%</td>\n",
+       "      <td>94.722222%</td>\n",
+       "      <td>94.722222%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_106_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gst_efficiency</td>\n",
+       "      <td>Global Memory Store Efficiency</td>\n",
+       "      <td>88.956522%</td>\n",
+       "      <td>88.956522%</td>\n",
+       "      <td>88.956522%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_94_gpu__red</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gld_efficiency</td>\n",
+       "      <td>Global Memory Load Efficiency</td>\n",
+       "      <td>99.756335%</td>\n",
+       "      <td>99.756335%</td>\n",
+       "      <td>99.756335%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_94_gpu__red</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gst_efficiency</td>\n",
+       "      <td>Global Memory Store Efficiency</td>\n",
+       "      <td>25.000000%</td>\n",
+       "      <td>25.000000%</td>\n",
+       "      <td>25.000000%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_66_gpu</td>\n",
+       "      <td>1</td>\n",
+       "      <td>gld_efficiency</td>\n",
+       "      <td>Global Memory Load Efficiency</td>\n",
+       "      <td>0.000000%</td>\n",
+       "      <td>0.000000%</td>\n",
+       "      <td>0.000000%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_66_gpu</td>\n",
+       "      <td>1</td>\n",
+       "      <td>gst_efficiency</td>\n",
+       "      <td>Global Memory Store Efficiency</td>\n",
+       "      <td>100.000000%</td>\n",
+       "      <td>100.000000%</td>\n",
+       "      <td>100.000000%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_88_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gld_efficiency</td>\n",
+       "      <td>Global Memory Load Efficiency</td>\n",
+       "      <td>91.834032%</td>\n",
+       "      <td>91.855433%</td>\n",
+       "      <td>91.843628%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_88_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gst_efficiency</td>\n",
+       "      <td>Global Memory Store Efficiency</td>\n",
+       "      <td>88.845486%</td>\n",
+       "      <td>88.845486%</td>\n",
+       "      <td>88.845486%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_111_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gld_efficiency</td>\n",
+       "      <td>Global Memory Load Efficiency</td>\n",
+       "      <td>25.000000%</td>\n",
+       "      <td>25.000000%</td>\n",
+       "      <td>25.000000%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
+       "      <td>main_111_gpu</td>\n",
+       "      <td>3</td>\n",
+       "      <td>gst_efficiency</td>\n",
+       "      <td>Global Memory Store Efficiency</td>\n",
+       "      <td>25.000000%</td>\n",
+       "      <td>25.000000%</td>\n",
+       "      <td>25.000000%</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      Device            Kernel  Invocations     Metric Name  \\\n",
+       "0   Tesla V100-SXM2-16GB (0)       main_98_gpu            3  gld_efficiency   \n",
+       "1   Tesla V100-SXM2-16GB (0)       main_98_gpu            3  gst_efficiency   \n",
+       "2   Tesla V100-SXM2-16GB (0)      main_106_gpu            3  gld_efficiency   \n",
+       "3   Tesla V100-SXM2-16GB (0)      main_106_gpu            3  gst_efficiency   \n",
+       "4   Tesla V100-SXM2-16GB (0)  main_94_gpu__red            3  gld_efficiency   \n",
+       "5   Tesla V100-SXM2-16GB (0)  main_94_gpu__red            3  gst_efficiency   \n",
+       "6   Tesla V100-SXM2-16GB (0)       main_66_gpu            1  gld_efficiency   \n",
+       "7   Tesla V100-SXM2-16GB (0)       main_66_gpu            1  gst_efficiency   \n",
+       "8   Tesla V100-SXM2-16GB (0)       main_88_gpu            3  gld_efficiency   \n",
+       "9   Tesla V100-SXM2-16GB (0)       main_88_gpu            3  gst_efficiency   \n",
+       "10  Tesla V100-SXM2-16GB (0)      main_111_gpu            3  gld_efficiency   \n",
+       "11  Tesla V100-SXM2-16GB (0)      main_111_gpu            3  gst_efficiency   \n",
+       "\n",
+       "                Metric Description          Min          Max          Avg  \n",
+       "0    Global Memory Load Efficiency   90.868353%   90.896134%   90.881874%  \n",
+       "1   Global Memory Store Efficiency   88.956522%   88.956522%   88.956522%  \n",
+       "2    Global Memory Load Efficiency   94.722222%   94.722222%   94.722222%  \n",
+       "3   Global Memory Store Efficiency   88.956522%   88.956522%   88.956522%  \n",
+       "4    Global Memory Load Efficiency   99.756335%   99.756335%   99.756335%  \n",
+       "5   Global Memory Store Efficiency   25.000000%   25.000000%   25.000000%  \n",
+       "6    Global Memory Load Efficiency    0.000000%    0.000000%    0.000000%  \n",
+       "7   Global Memory Store Efficiency  100.000000%  100.000000%  100.000000%  \n",
+       "8    Global Memory Load Efficiency   91.834032%   91.855433%   91.843628%  \n",
+       "9   Global Memory Store Efficiency   88.845486%   88.845486%   88.845486%  \n",
+       "10   Global Memory Load Efficiency   25.000000%   25.000000%   25.000000%  \n",
+       "11  Global Memory Store Efficiency   25.000000%   25.000000%   25.000000%  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "%cd $basedir/task2"
+    "data_frame_solution = pandas.read_csv('poisson2d.solution.efficiency.csv', sep=',')\n",
+    "data_frame_solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "#### Compilation\n",
+    "[Back to Top](#top)\n",
     "\n",
-    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
-    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "checkdir('task2')\n",
-    "!make poisson2d"
+    "---"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Running\n",
-    "\n",
-    "For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable `NP`. On _Ascent_ within a single node you can use up to 6 GPUs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-07T16:45:01.306609Z",
-     "start_time": "2018-11-07T16:45:01.212255Z"
-    },
-    "scrolled": true
+    "exercise": "solution"
    },
-   "outputs": [],
    "source": [
-    "checkdir('task2')\n",
-    "!NP=2 make run"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Scaling\n",
-    "\n",
-    "You can do a simple scaling run for up to all 6 GPUs in the node by executing the next cell."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "checkdir('task2')\n",
-    "!NP=1 make run | grep speedup > scale.out\n",
-    "!NP=2 make run | grep speedup >> scale.out\n",
-    "!NP=4 make run | grep speedup >>  scale.out\n",
-    "!NP=6 make run | grep speedup >>  scale.out\n",
-    "data_frame2 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
+    "## Solution 2:<a name=\"solution2\"></a>\n",
     "\n",
-    "!rm scale.out\n",
+    "Set the GPU used by the rank using `#pragma acc set device_num`\n",
+    "```C\n",
+    "//Initialize MPI and determine rank and size\n",
+    "MPI_Init(&argc, &argv);\n",
+    "MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n",
+    "MPI_Comm_size(MPI_COMM_WORLD, &size);\n",
     "\n",
-    "data_frame2b=data_frame2.iloc[:,[5,7,10,12]].copy()\n",
-    "data_frame2b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Profiling\n",
+    "#pragma acc set device_num( rank )\n",
     "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output files can be downloaded from here: [C Version](/tree/C/task2/pgprof.poisson2d.Task2.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task2/pgprof.poisson2d.Task2.tar.gz?download=1).\n",
-    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.      \n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "checkdir('task2')\n",
-    "!NP=2 make profile"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### References\n",
-    "1. http://www.openacc.org\n",
-    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
-    "3. https://www.open-mpi.org/doc/v3.1/"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "[Back to Top](#top)\n",
+    "real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));\n",
+    "real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));\n",
+    "real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));\n",
+    "real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));\n",
+    "```\n",
     "\n",
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Task 3: <a name=\"task3\"></a>Hide MPI Communication time\n",
     "\n",
-    "To overlap compute and communication you will need to\n",
+    "Apply domain decomposition\n",
+    "```C\n",
+    "// Ensure correctness if ny%size != 0\n",
+    "int chunk_size = ceil( (1.0*ny)/size );\n",
     "\n",
-    "* start the copy loop asynchronously\n",
-    "* wait for async copy loop after MPI communication has finished\n",
+    "int iy_start = rank * chunk_size;\n",
+    "int iy_end   = iy_start + chunk_size;\n",
     "\n",
-    "_Look for_ __TODOs__.\n",
+    "// Do not process boundaries\n",
+    "iy_start = max( iy_start, 1 );\n",
+    "iy_end = min( iy_end, ny - 1 );\n",
+    "```\n",
     "\n",
-    "Compare the scaling and efficiency with the results from the previous task. Check for the overlap in the profiler.\n",
+    "Exchange data\n",
+    "```C\n",
+    "//Periodic boundary conditions\n",
+    "int top    = (rank == 0) ? (size-1) : rank-1;\n",
+    "int bottom = (rank == (size-1)) ? 0 : rank+1;\n",
+    "#pragma acc host_data use_device( A )\n",
+    "{\n",
+    "    double start_mpi = MPI_Wtime();\n",
+    "    //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom\n",
+    "    MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
+    "                  A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
+    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
     "\n",
-    "_Optional_: Try to understand how well communication and compute overlap is able to improve efficiency when scaling to more GPUs.\n",
+    "    //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top\n",
+    "    MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
+    "                  A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
+    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
+    "    mpi_time += MPI_Wtime() - start_mpi;\n",
+    "}\n",
+    "```\n",
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task3/poisson2d.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task3/poisson2d.F03)\n",
-    "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task3/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task3/)\n",
-    "\n",
-    "__Before__ executing any of the cells below first execute the next cell to change to the right directory."
+    "* [C Version](/edit/C/task2/poisson2d.solution.c)\n",
+    "* [Fortran Version](/edit/FORTRAN/task2/poisson2d.solution.F03)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "%cd $basedir/task3"
+    "#### Compiling, Running and Profiling\n",
+    "\n",
+    "You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task2.solution.poisson2d.tar.gz`) with the File Browser. \n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant. "
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task2\n"
+     ]
+    }
+   ],
    "source": [
-    "#### Compilation\n",
-    "\n",
-    "If you are using the jupyter notebook approach you can execute the cells below. They will put you in the right directory. There you can call `make` with the desired [target](#make).\n",
-    "Alternatively you can just navigate to the right directory and execute `make <target>` in your terminal."
+    "%cd $basedir/task2"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 13,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mpicc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o\n",
+      "poisson2d_serial:\n",
+      "     36, Generating present(Anew[:],rhs[:],Aref[:])\n",
+      "     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])\n",
+      "     42, Generating Tesla code\n",
+      "         43, #pragma acc loop gang /* blockIdx.x */\n",
+      "         44, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         49, Generating implicit reduction(max:error)\n",
+      "     44, Loop is parallelizable\n",
+      "     53, Generating Tesla code\n",
+      "         54, #pragma acc loop gang /* blockIdx.x */\n",
+      "         55, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     55, Loop is parallelizable\n",
+      "     61, Generating Tesla code\n",
+      "         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     66, Generating Tesla code\n",
+      "         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     78, Generating update self(Aref[:ny*nx])\n",
+      "mpicc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution\n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     71, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])\n",
+      "     87, Generating present(Aref[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "         88, #pragma acc loop gang /* blockIdx.x */\n",
+      "         89, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     89, Loop is parallelizable\n",
+      "    140, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])\n",
+      "    143, Generating present(A[:],rhs[:],Anew[:])\n",
+      "         Generating Tesla code\n",
+      "        144, #pragma acc loop gang /* blockIdx.x */\n",
+      "        145, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "        149, Generating implicit reduction(max:error)\n",
+      "    145, Loop is parallelizable\n",
+      "    157, Generating present(Anew[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        158, #pragma acc loop gang /* blockIdx.x */\n",
+      "        159, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    159, Loop is parallelizable\n",
+      "    184, Generating present(A[:])\n",
+      "         Generating Tesla code\n",
+      "        185, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    195, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])\n",
+      "    213, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])\n"
+     ]
+    }
+   ],
    "source": [
-    "checkdir('task3')\n",
-    "!make poisson2d"
+    "checkdir('task2')\n",
+    "!make poisson2d.solution"
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
+      "Job <25195> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
+      "Calculate reference solution and time serial execution.\n",
+      "    0, 0.250000\n",
+      "  100, 0.249940\n",
+      "  200, 0.249880\n",
+      "  300, 0.249821\n",
+      "  400, 0.249761\n",
+      "  500, 0.249702\n",
+      "  600, 0.249642\n",
+      "  700, 0.249583\n",
+      "  800, 0.249524\n",
+      "  900, 0.249464\n",
+      "Parallel execution.\n",
+      "    0, 0.250000\n",
+      "  100, 0.249940\n",
+      "  200, 0.249880\n",
+      "  300, 0.249821\n",
+      "  400, 0.249761\n",
+      "  500, 0.249702\n",
+      "  600, 0.249642\n",
+      "  700, 0.249583\n",
+      "  800, 0.249524\n",
+      "  900, 0.249464\n",
+      "Num GPUs: 2.\n",
+      "4096x4096: 1 GPU:   1.3165 s, 2 GPUs:   0.7221 s, speedup:     1.82, efficiency:    91.17%\n",
+      "MPI time:   0.0422 s, inter GPU BW:     2.89 GiB/s\n"
+     ]
+    }
+   ],
    "source": [
-    "#### Running\n",
-    "\n",
-    "For the Multi-GPU version you can set the number of GPUs / MPI ranks using the variable `NP`. On _Ascent_ within a single node you can use up to 6 GPUs."
+    "checkdir('task2')\n",
+    "!NP=2 make run.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 15,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
+      "Job <25196> is submitted to default queue <batch>.\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "==92521== PGPROF is profiling process 92521, command: ./poisson2d.solution 10\n",
+      "==92520== PGPROF is profiling process 92520, command: ./poisson2d.solution 10\n",
+      "==92520== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.1.pgprof\n",
+      "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
+      "Calculate reference solution and time serial execution.\n",
+      "    0, 0.250000\n",
+      "Parallel execution.\n",
+      "    0, 0.250000\n",
+      "Num GPUs: 2.\n",
+      "4096x4096: 1 GPU:   0.0224 s, 2 GPUs:   0.0130 s, speedup:     1.73, efficiency:    86.37%\n",
+      "MPI time:   0.0007 s, inter GPU BW:     1.75 GiB/s\n",
+      "==92521== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.0.pgprof\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task2.NP2.?.pgprof  .\n",
+      "tar -cvzf pgprof.poisson2d.Task2.solution.tar.gz poisson2d.solution.Task2.NP2.?.pgprof\n",
+      "poisson2d.solution.Task2.NP2.0.pgprof\n",
+      "poisson2d.solution.Task2.NP2.1.pgprof\n"
+     ]
+    }
+   ],
    "source": [
-    "checkdir('task3')\n",
-    "!NP=2 make run"
+    "checkdir('task2')\n",
+    "!NP=2 make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Scaling\n",
     "\n",
@@ -670,74 +1039,117 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 16,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GPUs</th>\n",
+       "      <th>time [s]</th>\n",
+       "      <th>speedup</th>\n",
+       "      <th>efficiency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.4201</td>\n",
+       "      <td>0.93,</td>\n",
+       "      <td>92.67%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0.7157</td>\n",
+       "      <td>1.83,</td>\n",
+       "      <td>91.44%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4</td>\n",
+       "      <td>0.4301</td>\n",
+       "      <td>3.08,</td>\n",
+       "      <td>76.91%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>6</td>\n",
+       "      <td>0.3037</td>\n",
+       "      <td>4.32,</td>\n",
+       "      <td>71.94%</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   GPUs  time [s] speedup efficiency\n",
+       "0     1    1.4201   0.93,     92.67%\n",
+       "1     2    0.7157   1.83,     91.44%\n",
+       "2     4    0.4301   3.08,     76.91%\n",
+       "3     6    0.3037   4.32,     71.94%"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "checkdir('task3')\n",
-    "!NP=1 make run | grep speedup > scale.out\n",
-    "!NP=2 make run | grep speedup >> scale.out\n",
-    "!NP=4 make run | grep speedup >>  scale.out\n",
-    "!NP=6 make run | grep speedup >>  scale.out\n",
-    "data_frame3 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
+    "checkdir('task2')\n",
+    "!NP=1 make run.solution | grep speedup > scale.out\n",
+    "!NP=2 make run.solution | grep speedup >> scale.out\n",
+    "!NP=4 make run.solution | grep speedup >>  scale.out\n",
+    "!NP=6 make run.solution | grep speedup >>  scale.out\n",
+    "data_frameS2 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
     "\n",
     "!rm scale.out\n",
     "\n",
-    "data_frame3b=data_frame3.iloc[:,[5,7,10,12]].copy()\n",
-    "data_frame3b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Profiling\n",
-    "\n",
-    "You can profile the code by executing the next cell. __After__ the profiling finished the output files can be downloaded from here: [C Version](/tree/C/task3/pgprof.poisson2d.Task3.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task3/pgprof.poisson2d.Task3.tar.gz?download=1).\n",
-    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.   \n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "checkdir('task3')\n",
-    "!NP=2 make profile"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### References\n",
-    "1. http://www.openacc.org\n",
-    "2. [OpenACC Reference Card](https://www.openacc.org/sites/default/files/inline-files/OpenACC%20API%202.6%20Reference%20Guide.pdf)\n",
-    "3. https://www.open-mpi.org/doc/v3.1/"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "***\n",
-    "***"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Solutions<a name=\"solutions\"></a>\n",
-    "\n",
-    "Below are suggested solutions. This is only a short description of the solution, but the `poisson2d.solution.(c|F03)` files linked below have the full source code. If you want to run / profile the solutions feel free to duplicate the cells for the tasks and change the [make target](#make) to the `*.solution` ones."
+    "data_frameS2b=data_frameS2.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frameS2b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "[Back to Top](#top)\n",
     "\n",
@@ -746,149 +1158,377 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Solution 0:<a name=\"solution0\"></a>\n",
+    "## Solution 3:<a name=\"solution3\"></a>\n",
     "\n",
-    "```C++\n",
-    "#pragma acc parallel loop\n",
-    "for (int ix = ix_start; ix < ix_end; ix++)\n",
+    "\n",
+    "Update the boundaries first.\n",
+    "```C\n",
+    "#pragma acc parallel loop present(A,Anew)\n",
+    "for( int ix = ix_start; ix < ix_end; ix++ )\n",
     "{\n",
-    "    #pragma acc loop\n",
-    "    for( int iy = iy_start; iy < iy_end; iy++ )\n",
+    "    A[(iy_start)*nx+ix] = Anew[(iy_start)*nx+ix];\n",
+    "    A[(iy_end-1)*nx+ix] = Anew[(iy_end-1)*nx+ix];\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "Start the interior loop asynchronously so it can overlap with the MPI communication and wait at the end for the completion.\n",
+    "```C\n",
+    "#pragma acc parallel loop present(A,Anew) async\n",
+    "for (int iy = iy_start+1; iy < iy_end-1; iy++)\n",
+    "{\n",
+    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
     "    {\n",
-    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]\n",
-    "                                               + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));\n",
-    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
+    "        A[iy*nx+ix] = Anew[iy*nx+ix];\n",
     "    }\n",
     "}\n",
+    "\n",
+    "//Periodic boundary conditions\n",
+    "int top    = (rank == 0) ? (size-1) : rank-1;\n",
+    "int bottom = (rank == (size-1)) ? 0 : rank+1;\n",
+    "#pragma acc host_data use_device( A )\n",
+    "{\n",
+    "    double start_mpi = MPI_Wtime();\n",
+    "    //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom\n",
+    "    MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
+    "                  A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
+    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
+    "\n",
+    "    //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top\n",
+    "    MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
+    "                  A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
+    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
+    "    mpi_time += MPI_Wtime() - start_mpi;\n",
+    "}\n",
+    "#pragma acc wait\n",
     "```\n",
     "\n",
-    "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task0/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task0/poisson2d.solution.F03)\n",
     "\n",
-    "#### File browser\n",
+    "#### Code\n",
     "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task0/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task0/)"
+    "* [C Version](/edit/C/task3/poisson2d.solution.c)\n",
+    "* [Fortran Version](/edit/FORTRAN/task3/poisson2d.solution.F03)\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Compiling, Running and Profiling\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task0/poisson2d.solution.pgprof?download=1) / [Fortran Version](/tree/FORTRAN/task0/poisson2d.solution.pgprof?download=1).    "
+    "You can compile, run and profile the solution with the next cells. __After__ the profiling has completed, download the tarball containing the profiles (`pgprof.poisson2d.Task3.solution.tar.gz`) with the File Browser.\n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to select the _Multiple processes_ option in the import wizard."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
+   "execution_count": 17,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task0\n"
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task3\n"
      ]
     }
    ],
    "source": [
-    "%cd $basedir/task0"
+    "%cd $basedir/task3"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
+   "execution_count": 18,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "make: `poisson2d.solution' is up to date.\r\n"
+      "mpicc -c -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o\n",
+      "poisson2d_serial:\n",
+      "     36, Generating present(Anew[:],rhs[:],Aref[:])\n",
+      "     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])\n",
+      "     42, Generating Tesla code\n",
+      "         43, #pragma acc loop gang /* blockIdx.x */\n",
+      "         44, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         49, Generating implicit reduction(max:error)\n",
+      "     44, Loop is parallelizable\n",
+      "     53, Generating Tesla code\n",
+      "         54, #pragma acc loop gang /* blockIdx.x */\n",
+      "         55, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     55, Loop is parallelizable\n",
+      "     61, Generating Tesla code\n",
+      "         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     66, Generating Tesla code\n",
+      "         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     78, Generating update self(Aref[:ny*nx])\n",
+      "mpicc -DUSE_DOUBLE -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution\n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     71, Generating enter data create(rhs[:ny*nx],Aref[:ny*nx],A[:ny*nx],Anew[:ny*nx])\n",
+      "     87, Generating present(Aref[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "         88, #pragma acc loop gang /* blockIdx.x */\n",
+      "         89, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     89, Loop is parallelizable\n",
+      "    140, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])\n",
+      "    143, Generating present(A[:],rhs[:],Anew[:])\n",
+      "         Generating Tesla code\n",
+      "        144, #pragma acc loop gang /* blockIdx.x */\n",
+      "        145, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "        149, Generating implicit reduction(max:error)\n",
+      "    145, Loop is parallelizable\n",
+      "    157, Generating present(Anew[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        158, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    163, Generating present(Anew[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        164, #pragma acc loop gang /* blockIdx.x */\n",
+      "        165, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    165, Loop is parallelizable\n",
+      "    191, Generating present(A[:])\n",
+      "         Generating Tesla code\n",
+      "        192, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    202, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])\n",
+      "    220, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task0')\n",
+    "checkdir('task3')\n",
     "!make poisson2d.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
+   "execution_count": 19,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution\n",
-      "Job <4697> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
+      "Job <25201> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
-      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
-      "Calculate reference solution and time serial CPU execution.\n",
-      "    0, 0.249999\n",
-      "  100, 0.249760\n",
-      "  200, 0.249522\n",
-      "  300, 0.249285\n",
-      "  400, 0.249048\n",
-      "GPU execution.\n",
-      "    0, 0.249999\n",
-      "  100, 0.249760\n",
-      "  200, 0.249522\n",
-      "  300, 0.249285\n",
-      "  400, 0.249048\n",
-      "2048x2048: 1 CPU:   5.5979 s, 1 GPU:   0.2241 s, speedup:    24.98\n"
+      "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
+      "Calculate reference solution and time serial execution.\n",
+      "    0, 0.250000\n",
+      "  100, 0.249940\n",
+      "  200, 0.249880\n",
+      "  300, 0.249821\n",
+      "  400, 0.249761\n",
+      "  500, 0.249702\n",
+      "  600, 0.249642\n",
+      "  700, 0.249583\n",
+      "  800, 0.249524\n",
+      "  900, 0.249464\n",
+      "Parallel execution.\n",
+      "    0, 0.250000\n",
+      "  100, 0.249940\n",
+      "  200, 0.249880\n",
+      "  300, 0.249821\n",
+      "  400, 0.249761\n",
+      "  500, 0.249702\n",
+      "  600, 0.249642\n",
+      "  700, 0.249583\n",
+      "  800, 0.249524\n",
+      "  900, 0.249464\n",
+      "Num GPUs: 2.\n",
+      "4096x4096: 1 GPU:   1.3175 s, 2 GPUs:   0.6962 s, speedup:     1.89, efficiency:    94.62%\n",
+      "MPI time:   0.0583 s, inter GPU BW:     2.09 GiB/s\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task0')\n",
-    "!make run.solution"
+    "checkdir('task3')\n",
+    "!NP=2 make run.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
+   "execution_count": 20,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof ./poisson2d.solution 10\n",
-      "Job <4698> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
+      "Job <25202> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
-      "==33475== PGPROF is profiling process 33475, command: ./poisson2d.solution 10\n",
-      "==33475== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof\n",
-      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
-      "Calculate reference solution and time serial CPU execution.\n",
-      "    0, 0.249999\n",
-      "GPU execution.\n",
-      "    0, 0.249999\n",
-      "2048x2048: 1 CPU:   0.1245 s, 1 GPU:   0.0220 s, speedup:     5.66\n",
-      "mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.pgprof .\n"
+      "==93249== PGPROF is profiling process 93249, command: ./poisson2d.solution 10\n",
+      "==93248== PGPROF is profiling process 93248, command: ./poisson2d.solution 10\n",
+      "==93249== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.1.pgprof\n",
+      "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
+      "Calculate reference solution and time serial execution.\n",
+      "    0, 0.250000\n",
+      "Parallel execution.\n",
+      "    0, 0.250000\n",
+      "Num GPUs: 2.\n",
+      "4096x4096: 1 GPU:   0.0262 s, 2 GPUs:   0.0127 s, speedup:     2.06, efficiency:   103.02%\n",
+      "MPI time:   0.0009 s, inter GPU BW:     1.39 GiB/s\n",
+      "==93248== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.0.pgprof\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task3.NP2.?.pgprof  .\n",
+      "tar -cvzf pgprof.poisson2d.Task3.solution.tar.gz poisson2d.solution.Task3.NP2.?.pgprof\n",
+      "poisson2d.solution.Task3.NP2.0.pgprof\n",
+      "poisson2d.solution.Task3.NP2.1.pgprof\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task0')\n",
-    "!make profile.solution"
+    "checkdir('task3')\n",
+    "!NP=2 make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
+   "source": [
+    "#### Scaling\n",
+    "\n",
+    "You can do a simple scaling run on up to all 6 GPUs in the node by executing the next cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "exercise": "solution"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>GPUs</th>\n",
+       "      <th>time [s]</th>\n",
+       "      <th>speedup</th>\n",
+       "      <th>efficiency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.3935</td>\n",
+       "      <td>0.94,</td>\n",
+       "      <td>93.86%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0.6910</td>\n",
+       "      <td>1.89,</td>\n",
+       "      <td>94.52%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4</td>\n",
+       "      <td>0.3920</td>\n",
+       "      <td>3.37,</td>\n",
+       "      <td>84.13%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>6</td>\n",
+       "      <td>0.2841</td>\n",
+       "      <td>4.58,</td>\n",
+       "      <td>76.29%</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   GPUs  time [s] speedup efficiency\n",
+       "0     1    1.3935   0.94,     93.86%\n",
+       "1     2    0.6910   1.89,     94.52%\n",
+       "2     4    0.3920   3.37,     84.13%\n",
+       "3     6    0.2841   4.58,     76.29%"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "checkdir('task3')\n",
+    "!NP=1 make run.solution | grep speedup > scale.out\n",
+    "!NP=2 make run.solution | grep speedup >> scale.out\n",
+    "!NP=4 make run.solution | grep speedup >>  scale.out\n",
+    "!NP=6 make run.solution | grep speedup >>  scale.out\n",
+    "data_frameS3 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
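+    "# the grep'ed speedup line is split on whitespace: columns 5, 7, 10, 12 hold #GPUs, time [s], speedup, efficiency\n",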
+    "\n",
+    "!rm scale.out\n",
+    "\n",
+    "data_frameS3b=data_frameS3.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frameS3b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
+    "The overlap of computation and communication can be seen in the profiler, as shown in the example below.\n",
+    "\n",
+    "![Solution3.png](./resources/Solution3.png)\n",
+    "\n",
+    "\n",
     "[Back to Top](#top)\n",
     "\n",
     "---"
@@ -896,190 +1536,292 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Solution 1:<a name=\"solution1\"></a>\n",
+    "## Solution 4:<a name=\"solution4\"></a>\n",
     "\n",
-    "Swap the `ix` and `iy` loops to make sure that `ix` is the fastest running index \n",
     "\n",
+    "Include the NVSHMEM headers\n",
+    "\n",
+    "```C\n",
+    "#include <nvshmem.h>\n",
+    "#include <nvshmemx.h>\n",
+    "```\n",
+    "and initialize NVSHMEM with MPI\n",
+    "```C\n",
+    "MPI_Comm mpi_comm = MPI_COMM_WORLD;\n",
+    "nvshmemx_init_attr_t attr;\n",
+    "attr.mpi_comm = &mpi_comm;\n",
+    "nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);\n",
+    "```\n",
+    "    \n",
+    "Allocate device memory and map it to the host allocation for OpenACC\n",
+    "```C\n",
+    "real *d_A = (real *)nvshmem_malloc(nx * ny * sizeof(real));\n",
+    "map(A, d_A, nx * ny * sizeof(real));\n",
+    "```\n",
+    "\n",
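+    "The `map(...)` call above is a helper from the tutorial code, not an NVSHMEM or OpenACC directive. A minimal sketch of such a helper, assuming it simply wraps the OpenACC runtime routine `acc_map_data` so that `present(A)` clauses resolve to the NVSHMEM allocation `d_A`:\n",
+    "```C\n",
+    "#include <openacc.h>\n",
+    "\n",
+    "// Assumed helper (sketch): associate the NVSHMEM device buffer d_A with the\n",
+    "// host pointer A so that present(A) clauses find the device copy.\n",
+    "static void map(void *host_ptr, void *device_ptr, size_t bytes)\n",
+    "{\n",
+    "    acc_map_data(host_ptr, device_ptr, bytes);\n",
+    "}\n",
+    "```\n",
+    "\n",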
+    "Calculate the right locations on the remote GPUs and communicate data\n",
     "```C\n",
-    "#pragma acc parallel loop\n",
-    "for (int iy = iy_start; iy < iy_end; iy++)\n",
+    "// Periodic boundary conditions\n",
+    "int top = (rank == 0) ? (size - 1) : rank - 1;\n",
+    "int bottom = (rank == (size - 1)) ? 0 : rank + 1;\n",
+    "int iy_start_top = top * chunk_size;\n",
+    "int iy_end_top = iy_start_top + chunk_size;\n",
+    "\n",
+    "// Do not process boundaries\n",
+    "iy_start_top = max(iy_start_top, 1);\n",
+    "iy_end_top = min(iy_end_top, ny - 1);\n",
+    "\n",
+    "int iy_start_bottom = bottom * chunk_size;\n",
+    "int iy_end_bottom = iy_start_bottom + chunk_size;\n",
+    "\n",
+    "// Do not process boundaries\n",
+    "iy_start_bottom = max(iy_start_bottom, 1);\n",
+    "iy_end_bottom = min(iy_end_bottom, ny - 1);\n",
+    "\n",
+    "// Halo exchange\n",
+    "#pragma acc host_data use_device(A)\n",
     "{\n",
-    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "    {\n",
-    "        Anew[iy*nx+ix] = -0.25 * (rhs[iy*nx+ix] - ( A[iy*nx+ix+1] + A[iy*nx+ix-1]\n",
-    "                                               + A[(iy-1)*nx+ix] + A[(iy+1)*nx+ix] ));\n",
-    "        error = fmaxr( error, fabsr(Anew[iy*nx+ix]-A[iy*nx+ix]));\n",
-    "    }\n",
+    "    double start_mpi = MPI_Wtime();\n",
+    "    nvshmem_double_put((double *)(A + iy_end_top * nx + ix_start),\n",
+    "                       (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top);\n",
+    "    nvshmem_double_put((double *)(A + (iy_start_bottom - 1) * nx + ix_start),\n",
+    "                       (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start),\n",
+    "                       bottom);\n",
+    "    nvshmem_barrier_all();\n",
+    "    mpi_time += MPI_Wtime() - start_mpi;\n",
     "}\n",
     "```\n",
     "\n",
-    "#### Code\n",
-    "\n",
-    "* [C Version](/edit/C/task1/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task1/poisson2d.solution.F03)\n",
+    "Finally, remember to deallocate:\n",
+    "```C\n",
+    "nvshmem_free(d_A);\n",
+    "```\n",
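+    "\n",
+    "Depending on how the program shuts down, the NVSHMEM runtime would typically also be finalized before `MPI_Finalize` (an assumption here; check the solution source for the exact order):\n",
+    "```C\n",
+    "nvshmem_finalize();\n",
+    "MPI_Finalize();\n",
+    "```\n",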
     "\n",
-    "#### File browser\n",
+    "#### Code\n",
     "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task1/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task1/)"
+    "* [C Version](/edit/C/task4/poisson2d.solution.c)"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Compiling, Running and Profiling\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task1/pgprof.poisson2d.Task1.solution.tar.gz?download=1).  "
+    "You can compile, run and profile the solution with the next cells. __After__ the profiling has completed, download the tarball containing the profiles (`pgprof.poisson2d.Task4.solution.tar.gz`) with the File Browser.\n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to select the _Multiple processes_ option in the import wizard."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
+   "execution_count": 22,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task1\n"
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task4\n"
      ]
     }
    ],
    "source": [
-    "%cd $basedir/task1"
+    "%cd $basedir/task4"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
+   "execution_count": 23,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "make: `poisson2d.solution' is up to date.\r\n"
+      "mpicxx -c -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o\n",
+      "poisson2d_serial(int, int, double, double *, double *, int, int, const double *):\n",
+      "     37, Generating present(Anew[:],rhs[:],Aref[:])\n",
+      "     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])\n",
+      "     40, Generating Tesla code\n",
+      "         43, #pragma acc loop gang /* blockIdx.x */\n",
+      "         44, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         49, Generating implicit reduction(max:error)\n",
+      "     44, Loop is parallelizable\n",
+      "     51, Generating Tesla code\n",
+      "         54, #pragma acc loop gang /* blockIdx.x */\n",
+      "         55, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     55, Loop is parallelizable\n",
+      "     58, Generating Tesla code\n",
+      "         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     65, Generating Tesla code\n",
+      "         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     77, Generating update self(Aref[:ny*nx])\n",
+      "mpicxx -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned -I/gpfs/wolf/trn003/world-shared/software/nvshmem//include poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution -L/gpfs/wolf/trn003/world-shared/software/nvshmem//lib -lnvshmem -Mcuda -lcuda -lrt \n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     90, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])\n",
+      "    101, Generating present(Aref[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        105, #pragma acc loop gang /* blockIdx.x */\n",
+      "        106, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    106, Loop is parallelizable\n",
+      "    162, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])\n",
+      "    163, Generating present(A[:],rhs[:],Anew[:])\n",
+      "         Generating Tesla code\n",
+      "        166, #pragma acc loop gang /* blockIdx.x */\n",
+      "        167, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "        171, Generating implicit reduction(max:error)\n",
+      "    167, Loop is parallelizable\n",
+      "    177, Generating present(Anew[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        180, #pragma acc loop gang /* blockIdx.x */\n",
+      "        181, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    181, Loop is parallelizable\n",
+      "    214, Generating present(A[:])\n",
+      "         Generating Tesla code\n",
+      "        217, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    227, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])\n",
+      "    246, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task1')\n",
+    "checkdir('task4')\n",
     "!make poisson2d.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
+   "execution_count": 24,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS ./poisson2d.solution\n",
-      "Job <4699> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
+      "Job <25207> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
-      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
-      "Calculate reference solution and time serial CPU execution.\n",
-      "    0, 0.249999\n",
-      "  100, 0.249760\n",
-      "  200, 0.249522\n",
-      "  300, 0.249285\n",
-      "  400, 0.249048\n",
-      "GPU execution.\n",
-      "    0, 0.249999\n",
-      "  100, 0.249760\n",
-      "  200, 0.249522\n",
-      "  300, 0.249285\n",
-      "  400, 0.249048\n",
-      "2048x2048: 1 CPU:   5.5086 s, 1 GPU:   0.2293 s, speedup:    24.02\n"
+      "WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation \n",
+      "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
+      "Calculate reference solution and time serial execution.\n",
+      "    0, 0.250000\n",
+      "  100, 0.249940\n",
+      "  200, 0.249880\n",
+      "  300, 0.249821\n",
+      "  400, 0.249761\n",
+      "  500, 0.249702\n",
+      "  600, 0.249642\n",
+      "  700, 0.249583\n",
+      "  800, 0.249524\n",
+      "  900, 0.249464\n",
+      "Parallel execution.\n",
+      "    0, 0.250000\n",
+      "  100, 0.249940\n",
+      "  200, 0.249880\n",
+      "  300, 0.249821\n",
+      "  400, 0.249761\n",
+      "  500, 0.249702\n",
+      "  600, 0.249642\n",
+      "  700, 0.249583\n",
+      "  800, 0.249524\n",
+      "  900, 0.249464\n",
+      "Num GPUs: 2.\n",
+      "4096x4096: 1 GPU:   1.3171 s, 2 GPUs:   0.7377 s, speedup:     1.79, efficiency:    89.27%\n",
+      "MPI time:   0.0686 s, inter GPU BW:     1.78 GiB/s\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task1')\n",
-    "!make run.solution"
+    "checkdir('task4')\n",
+    "!NP=2 make run.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
+   "execution_count": 25,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.timeline.pgprof ./poisson2d.solution 3\n",
-      "Job <4700> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "==78449== PGPROF is profiling process 78449, command: ./poisson2d.solution 3\n",
-      "==78449== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.timeline.pgprof\n",
-      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
-      "Calculate reference solution and time serial CPU execution.\n",
-      "    0, 0.249999\n",
-      "GPU execution.\n",
-      "    0, 0.249999\n",
-      "2048x2048: 1 CPU:   0.0476 s, 1 GPU:   0.0190 s, speedup:     2.51\n",
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  --analysis-metrics -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.metrics.pgprof ./poisson2d.solution 3\n",
-      "Job <4701> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
+      "Job <25208> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
-      "==33668== PGPROF is profiling process 33668, command: ./poisson2d.solution 3\n",
-      "==33668== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.\n",
-      "==33668== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.metrics.pgprof\n",
-      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
-      "Calculate reference solution and time serial CPU execution.\n",
-      "    0, 0.249999\n",
-      "GPU execution.\n",
-      "    0, 0.249999\n",
-      "2048x2048: 1 CPU:   0.0490 s, 1 GPU:  15.6526 s, speedup:     0.00\n",
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS pgprof -f --cpu-profiling off  --metrics gld_efficiency,gst_efficiency -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.efficiency.pgprof ./poisson2d.solution 3\n",
-      "Job <4702> is submitted to default queue <batch>.\n",
-      "<<Waiting for dispatch ...>>\n",
-      "<<Starting on login1>>\n",
-      "==78646== PGPROF is profiling process 78646, command: ./poisson2d.solution 3\n",
-      "==78646== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.\n",
-      "==78646== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.efficiency.pgprof\n",
-      "Jacobi relaxation Calculation: 2048 x 2048 mesh\n",
-      "Calculate reference solution and time serial CPU execution.\n",
-      "    0, 0.249999\n",
-      "GPU execution.\n",
-      "    0, 0.249999\n",
-      "2048x2048: 1 CPU:   0.0489 s, 1 GPU:   0.6829 s, speedup:     0.07\n",
-      "pgprof --csv -i /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.efficiency.pgprof 2>&1 | grep -v \"======\" > poisson2d.solution.efficiency.csv\n",
-      "mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.*.pgprof .\n",
-      "tar -cvzf pgprof.poisson2d.Task1.solution.tar.gz  poisson2d.solution.*.pgprof\n",
-      "poisson2d.solution.efficiency.pgprof\n",
-      "poisson2d.solution.metrics.pgprof\n",
-      "poisson2d.solution.timeline.pgprof\n"
+      "==93971== PGPROF is profiling process 93971, command: ./poisson2d.solution 10\n",
+      "==93970== PGPROF is profiling process 93970, command: ./poisson2d.solution 10\n",
+      "==93971== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.0.pgprof\n",
+      "WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation \n",
+      "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
+      "Calculate reference solution and time serial execution.\n",
+      "    0, 0.250000\n",
+      "Parallel execution.\n",
+      "    0, 0.250000\n",
+      "Num GPUs: 2.\n",
+      "4096x4096: 1 GPU:   0.0225 s, 2 GPUs:   0.0132 s, speedup:     1.71, efficiency:    85.34%\n",
+      "MPI time:   0.0010 s, inter GPU BW:     1.24 GiB/s\n",
+      "==93970== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.1.pgprof\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task4.NP2.?.pgprof  .\n",
+      "tar -cvzf pgprof.poisson2d.Task4.solution.tar.gz poisson2d.solution.Task4.NP2.?.pgprof\n",
+      "poisson2d.solution.Task4.NP2.0.pgprof\n",
+      "poisson2d.solution.Task4.NP2.1.pgprof\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task1')\n",
-    "!make profile.solution"
+    "checkdir('task4')\n",
+    "!NP=2 make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "For the _Global Memory Load/Store Efficiency_ the `make profile` command also generated a CSV file that you can import and view with the cell below.  \n",
-    "If you purely work in a terminal you can view the same output by running `pgprof -i poisson2d.efficiency.solution.pgprof`."
+    "#### Scaling\n",
+    "\n",
+    "You can do a simple scaling run on up to all 6 GPUs in the node by executing the next cell."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
+   "execution_count": 26,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n",
+      "<<Waiting for dispatch ...>>\n",
+      "<<Starting on login1>>\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -1101,197 +1843,82 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>Device</th>\n",
-       "      <th>Kernel</th>\n",
-       "      <th>Invocations</th>\n",
-       "      <th>Metric Name</th>\n",
-       "      <th>Metric Description</th>\n",
-       "      <th>Min</th>\n",
-       "      <th>Max</th>\n",
-       "      <th>Avg</th>\n",
+       "      <th>GPUs</th>\n",
+       "      <th>time [s]</th>\n",
+       "      <th>speedup</th>\n",
+       "      <th>efficiency</th>\n",
        "    </tr>\n",
        "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_70_gpu</td>\n",
-       "      <td>1</td>\n",
-       "      <td>gld_efficiency</td>\n",
-       "      <td>Global Memory Load Efficiency</td>\n",
-       "      <td>0.000000%</td>\n",
-       "      <td>0.000000%</td>\n",
-       "      <td>0.000000%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_70_gpu</td>\n",
-       "      <td>1</td>\n",
-       "      <td>gst_efficiency</td>\n",
-       "      <td>Global Memory Store Efficiency</td>\n",
-       "      <td>100.000000%</td>\n",
-       "      <td>100.000000%</td>\n",
-       "      <td>100.000000%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_95_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gld_efficiency</td>\n",
-       "      <td>Global Memory Load Efficiency</td>\n",
-       "      <td>91.879935%</td>\n",
-       "      <td>91.897053%</td>\n",
-       "      <td>91.888339%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_95_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gst_efficiency</td>\n",
-       "      <td>Global Memory Store Efficiency</td>\n",
-       "      <td>88.845486%</td>\n",
-       "      <td>88.845486%</td>\n",
-       "      <td>88.845486%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_102_gpu__red</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gld_efficiency</td>\n",
-       "      <td>Global Memory Load Efficiency</td>\n",
-       "      <td>99.756335%</td>\n",
-       "      <td>99.756335%</td>\n",
-       "      <td>99.756335%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_102_gpu__red</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gst_efficiency</td>\n",
-       "      <td>Global Memory Store Efficiency</td>\n",
-       "      <td>25.000000%</td>\n",
-       "      <td>25.000000%</td>\n",
-       "      <td>25.000000%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_122_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gld_efficiency</td>\n",
-       "      <td>Global Memory Load Efficiency</td>\n",
-       "      <td>25.000000%</td>\n",
-       "      <td>25.000000%</td>\n",
-       "      <td>25.000000%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_122_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gst_efficiency</td>\n",
-       "      <td>Global Memory Store Efficiency</td>\n",
-       "      <td>25.000000%</td>\n",
-       "      <td>25.000000%</td>\n",
-       "      <td>25.000000%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_106_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gld_efficiency</td>\n",
-       "      <td>Global Memory Load Efficiency</td>\n",
-       "      <td>91.823101%</td>\n",
-       "      <td>91.890100%</td>\n",
-       "      <td>91.851075%</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_106_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gst_efficiency</td>\n",
-       "      <td>Global Memory Store Efficiency</td>\n",
-       "      <td>88.956522%</td>\n",
-       "      <td>88.956522%</td>\n",
-       "      <td>88.956522%</td>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1.3685</td>\n",
+       "      <td>0.96,</td>\n",
+       "      <td>96.08%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_116_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gld_efficiency</td>\n",
-       "      <td>Global Memory Load Efficiency</td>\n",
-       "      <td>94.722222%</td>\n",
-       "      <td>94.722222%</td>\n",
-       "      <td>94.722222%</td>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0.7472</td>\n",
+       "      <td>1.78,</td>\n",
+       "      <td>88.90%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>Tesla V100-SXM2-16GB (0)</td>\n",
-       "      <td>main_116_gpu</td>\n",
-       "      <td>3</td>\n",
-       "      <td>gst_efficiency</td>\n",
-       "      <td>Global Memory Store Efficiency</td>\n",
-       "      <td>88.956522%</td>\n",
-       "      <td>88.956522%</td>\n",
-       "      <td>88.956522%</td>\n",
+       "      <th>2</th>\n",
+       "      <td>4</td>\n",
+       "      <td>0.4605</td>\n",
+       "      <td>2.85,</td>\n",
+       "      <td>71.27%</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>6</td>\n",
+       "      <td>0.3612</td>\n",
+       "      <td>3.60,</td>\n",
+       "      <td>60.05%</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                      Device             Kernel  Invocations     Metric Name  \\\n",
-       "0   Tesla V100-SXM2-16GB (0)        main_70_gpu            1  gld_efficiency   \n",
-       "1   Tesla V100-SXM2-16GB (0)        main_70_gpu            1  gst_efficiency   \n",
-       "2   Tesla V100-SXM2-16GB (0)        main_95_gpu            3  gld_efficiency   \n",
-       "3   Tesla V100-SXM2-16GB (0)        main_95_gpu            3  gst_efficiency   \n",
-       "4   Tesla V100-SXM2-16GB (0)  main_102_gpu__red            3  gld_efficiency   \n",
-       "5   Tesla V100-SXM2-16GB (0)  main_102_gpu__red            3  gst_efficiency   \n",
-       "6   Tesla V100-SXM2-16GB (0)       main_122_gpu            3  gld_efficiency   \n",
-       "7   Tesla V100-SXM2-16GB (0)       main_122_gpu            3  gst_efficiency   \n",
-       "8   Tesla V100-SXM2-16GB (0)       main_106_gpu            3  gld_efficiency   \n",
-       "9   Tesla V100-SXM2-16GB (0)       main_106_gpu            3  gst_efficiency   \n",
-       "10  Tesla V100-SXM2-16GB (0)       main_116_gpu            3  gld_efficiency   \n",
-       "11  Tesla V100-SXM2-16GB (0)       main_116_gpu            3  gst_efficiency   \n",
-       "\n",
-       "                Metric Description          Min          Max          Avg  \n",
-       "0    Global Memory Load Efficiency    0.000000%    0.000000%    0.000000%  \n",
-       "1   Global Memory Store Efficiency  100.000000%  100.000000%  100.000000%  \n",
-       "2    Global Memory Load Efficiency   91.879935%   91.897053%   91.888339%  \n",
-       "3   Global Memory Store Efficiency   88.845486%   88.845486%   88.845486%  \n",
-       "4    Global Memory Load Efficiency   99.756335%   99.756335%   99.756335%  \n",
-       "5   Global Memory Store Efficiency   25.000000%   25.000000%   25.000000%  \n",
-       "6    Global Memory Load Efficiency   25.000000%   25.000000%   25.000000%  \n",
-       "7   Global Memory Store Efficiency   25.000000%   25.000000%   25.000000%  \n",
-       "8    Global Memory Load Efficiency   91.823101%   91.890100%   91.851075%  \n",
-       "9   Global Memory Store Efficiency   88.956522%   88.956522%   88.956522%  \n",
-       "10   Global Memory Load Efficiency   94.722222%   94.722222%   94.722222%  \n",
-       "11  Global Memory Store Efficiency   88.956522%   88.956522%   88.956522%  "
+       "   GPUs  time [s] speedup efficiency\n",
+       "0     1    1.3685   0.96,     96.08%\n",
+       "1     2    0.7472   1.78,     88.90%\n",
+       "2     4    0.4605   2.85,     71.27%\n",
+       "3     6    0.3612   3.60,     60.05%"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "data_frame_solution = pandas.read_csv('poisson2d.solution.efficiency.csv', sep=',')\n",
-    "data_frame_solution"
+    "checkdir('task4')\n",
+    "!NP=1 make run.solution | grep speedup > scale.out\n",
+    "!NP=2 make run.solution | grep speedup >> scale.out\n",
+    "!NP=4 make run.solution | grep speedup >>  scale.out\n",
+    "!NP=6 make run.solution | grep speedup >>  scale.out\n",
+    "data_frameS4 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
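+    "# the grep'ed speedup line is split on whitespace: columns 5, 7, 10, 12 hold #GPUs, time [s], speedup, efficiency\n",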
+    "\n",
+    "!rm scale.out\n",
+    "\n",
+    "data_frameS4b=data_frameS4.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frameS4b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
+    "The communication using NVSHMEM, and the barrier executed as a kernel on the device, can be seen in the profiler, as shown in the example below.\n",
+    "\n",
+    "![Solution4.png](./resources/Solution4.png)\n",
+    "\n",
     "[Back to Top](#top)\n",
     "\n",
     "---"
@@ -1299,129 +1926,150 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Solution 2:<a name=\"solution2\"></a>\n",
-    "\n",
-    "Set the GPU used by the rank using `#pragma acc set device_num`\n",
-    "```C\n",
-    "//Initialize MPI and determine rank and size\n",
-    "MPI_Init(&argc, &argv);\n",
-    "MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n",
-    "MPI_Comm_size(MPI_COMM_WORLD, &size);\n",
-    "\n",
-    "#pragma acc set device_num( rank )\n",
-    "\n",
-    "real* restrict const A    = (real*) malloc(nx*ny*sizeof(real));\n",
-    "real* restrict const Aref = (real*) malloc(nx*ny*sizeof(real));\n",
-    "real* restrict const Anew = (real*) malloc(nx*ny*sizeof(real));\n",
-    "real* restrict const rhs  = (real*) malloc(nx*ny*sizeof(real));\n",
-    "```\n",
+    "## Solution 5:<a name=\"solution5\"></a>\n",
     "\n",
+    "Basically all kernels in the `while` loop can use the `async` clause; they will then all use the OpenACC default async queue. Please take a look at the solution source code.\n",
     "\n",
-    "Apply domain decomposition\n",
+    "To also place the halo exchange in the same queue, use:\n",
     "```C\n",
-    "// Ensure correctness if ny%size != 0\n",
-    "int chunk_size = ceil( (1.0*ny)/size );\n",
-    "\n",
-    "int iy_start = rank * chunk_size;\n",
-    "int iy_end   = iy_start + chunk_size;\n",
-    "\n",
-    "// Do not process boundaries\n",
-    "iy_start = max( iy_start, 1 );\n",
-    "iy_end = min( iy_end, ny - 1 );\n",
+    "#pragma acc host_data use_device(A)\n",
+    "{\n",
+    "    nvshmemx_double_put_on_stream(\n",
+    "        (double *)(A + iy_end_top * nx + ix_start),\n",
+    "        (double *)(A + iy_start * nx + ix_start), (ix_end - ix_start), top,\n",
+    "        (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));\n",
+    "    nvshmemx_double_put_on_stream(\n",
+    "        (double *)(A + (iy_start_bottom - 1) * nx + ix_start),\n",
+    "        (double *)(A + (iy_end - 1) * nx + ix_start), (ix_end - ix_start), bottom,\n",
+    "        (cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));\n",
+    "}\n",
+    "nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));\n",
     "```\n",
     "\n",
-    "Exchange data\n",
+    "Finally, when copying out data, make sure to wait for all device computation to finish first:\n",
     "```C\n",
-    "//Periodic boundary conditions\n",
-    "int top    = (rank == 0) ? (size-1) : rank-1;\n",
-    "int bottom = (rank == (size-1)) ? 0 : rank+1;\n",
-    "#pragma acc host_data use_device( A )\n",
-    "{\n",
-    "    double start_mpi = MPI_Wtime();\n",
-    "    //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom\n",
-    "    MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "\n",
-    "    //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top\n",
-    "    MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "    mpi_time += MPI_Wtime() - start_mpi;\n",
-    "}\n",
+    "#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx]) wait\n",
     "```\n",
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task2/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task2/poisson2d.solution.F03)\n",
-    "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task2/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task2/)"
+    "* [C Version](/edit/C/task5/poisson2d.solution.c)"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Compiling, Running and Profiling\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task2/pgprof.poisson2d.Task2.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task2/pgprof.poisson2d.Task2.solution.tar.gz?download=1).    "
+    "You can compile, run and profile the solution with the next cells. __After__ the profiling has completed, download the tarball containing the profiles (`pgprof.poisson2d.Task5.solution.tar.gz`) with the File Browser.\n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to select the _Multiple processes_ option in the import wizard."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
+   "execution_count": 27,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task2p\n"
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task5\n"
      ]
     }
    ],
    "source": [
-    "%cd $basedir/task2"
+    "%cd $basedir/task5"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
+   "execution_count": 28,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "make: `poisson2d.solution' is up to date.\r\n"
+      "mpicxx -c -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o\n",
+      "poisson2d_serial(int, int, double, double *, double *, int, int, const double *):\n",
+      "     37, Generating present(Anew[:],rhs[:],Aref[:])\n",
+      "     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])\n",
+      "     40, Generating Tesla code\n",
+      "         43, #pragma acc loop gang /* blockIdx.x */\n",
+      "         44, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         49, Generating implicit reduction(max:error)\n",
+      "     44, Loop is parallelizable\n",
+      "     51, Generating Tesla code\n",
+      "         54, #pragma acc loop gang /* blockIdx.x */\n",
+      "         55, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     55, Loop is parallelizable\n",
+      "     58, Generating Tesla code\n",
+      "         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     65, Generating Tesla code\n",
+      "         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     77, Generating update self(Aref[:ny*nx])\n",
+      "mpicxx -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned -I/ccsopen/home/mathiasw/nvshmem-master/build/include poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution -L/ccsopen/home/mathiasw/nvshmem-master/build/lib -lnvshmem -Mcuda -lcuda -lrt \n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     90, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])\n",
+      "    101, Generating present(Aref[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        105, #pragma acc loop gang /* blockIdx.x */\n",
+      "        106, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    106, Loop is parallelizable\n",
+      "    137, Generating update device(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)],rhs[nx*iy_start:nx*(iy_end-iy_start)])\n",
+      "    138, Generating present(A[:],rhs[:],Anew[:])\n",
+      "         Generating Tesla code\n",
+      "        141, #pragma acc loop gang /* blockIdx.x */\n",
+      "        142, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "        146, Generating implicit reduction(max:error)\n",
+      "    142, Loop is parallelizable\n",
+      "    152, Generating present(Anew[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        155, #pragma acc loop gang /* blockIdx.x */\n",
+      "        156, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    156, Loop is parallelizable\n",
+      "    190, Generating present(A[:])\n",
+      "         Generating Tesla code\n",
+      "        193, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    203, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])\n",
+      "    221, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task2')\n",
+    "checkdir('task5')\n",
     "!make poisson2d.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
+   "execution_count": 29,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
-      "Job <4703> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
+      "Job <25213> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
+      "WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation \n",
       "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
       "Calculate reference solution and time serial execution.\n",
       "    0, 0.250000\n",
@@ -1446,56 +2094,59 @@
       "  800, 0.249524\n",
       "  900, 0.249464\n",
       "Num GPUs: 2.\n",
-      "4096x4096: 1 GPU:   1.3294 s, 2 GPUs:   0.7305 s, speedup:     1.82, efficiency:    91.00%\n",
-      "MPI time:   0.0558 s, inter GPU BW:     2.19 GiB/s\n"
+      "4096x4096: 1 GPU:   1.3176 s, 2 GPUs:   0.6777 s, speedup:     1.94, efficiency:    97.22%\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task2')\n",
+    "checkdir('task5')\n",
     "!NP=2 make run.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
+   "execution_count": 30,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --annotate-mpi openmpi -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
-      "Job <4704> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
+      "Job <25214> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
-      "==33912== PGPROF is profiling process 33912, command: ./poisson2d.solution 10\n",
-      "==33913== PGPROF is profiling process 33913, command: ./poisson2d.solution 10\n",
-      "==33912== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.1.pgprof\n",
-      "==33913== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.0.pgprof\n",
+      "==94705== PGPROF is profiling process 94705, command: ./poisson2d.solution 10\n",
+      "==94707== PGPROF is profiling process 94707, command: ./poisson2d.solution 10\n",
+      "==94707== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.1.pgprof\n",
+      "WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation \n",
       "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
       "Calculate reference solution and time serial execution.\n",
       "    0, 0.250000\n",
       "Parallel execution.\n",
       "    0, 0.250000\n",
       "Num GPUs: 2.\n",
-      "4096x4096: 1 GPU:   0.0233 s, 2 GPUs:   0.0142 s, speedup:     1.64, efficiency:    82.17%\n",
-      "MPI time:   0.0008 s, inter GPU BW:     1.62 GiB/s\n",
-      "mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task2.NP2.?.pgprof  .\n",
-      "tar -cvzf pgprof.poisson2d.Task2.solution.tar.gz poisson2d.solution.Task2.NP2.?.pgprof\n",
-      "poisson2d.solution.Task2.NP2.0.pgprof\n",
-      "poisson2d.solution.Task2.NP2.1.pgprof\n"
+      "4096x4096: 1 GPU:   0.0225 s, 2 GPUs:   0.0117 s, speedup:     1.92, efficiency:    96.05%\n",
+      "==94705== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.0.pgprof\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task5.NP2.?.pgprof  .\n",
+      "tar -cvzf pgprof.poisson2d.Task5.solution.tar.gz poisson2d.solution.Task5.NP2.?.pgprof\n",
+      "poisson2d.solution.Task5.NP2.0.pgprof\n",
+      "poisson2d.solution.Task5.NP2.1.pgprof\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task2')\n",
+    "checkdir('task5')\n",
     "!NP=2 make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Scaling\n",
     "\n",
@@ -1504,8 +2155,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {},
+   "execution_count": 31,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -1552,30 +2205,30 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>1</td>\n",
-       "      <td>1.4007</td>\n",
-       "      <td>0.94,</td>\n",
-       "      <td>94.02%</td>\n",
+       "      <td>1.2915</td>\n",
+       "      <td>1.02,</td>\n",
+       "      <td>101.63%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>2</td>\n",
-       "      <td>0.7482</td>\n",
-       "      <td>1.77,</td>\n",
-       "      <td>88.38%</td>\n",
+       "      <td>0.6742</td>\n",
+       "      <td>1.96,</td>\n",
+       "      <td>98.08%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>4</td>\n",
-       "      <td>0.4536</td>\n",
-       "      <td>2.94,</td>\n",
-       "      <td>73.56%</td>\n",
+       "      <td>0.3801</td>\n",
+       "      <td>3.47,</td>\n",
+       "      <td>86.66%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>6</td>\n",
-       "      <td>0.3480</td>\n",
-       "      <td>3.78,</td>\n",
-       "      <td>62.95%</td>\n",
+       "      <td>0.2733</td>\n",
+       "      <td>4.80,</td>\n",
+       "      <td>80.04%</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1583,35 +2236,41 @@
       ],
       "text/plain": [
        "   GPUs  time [s] speedup efficiency\n",
-       "0     1    1.4007   0.94,     94.02%\n",
-       "1     2    0.7482   1.77,     88.38%\n",
-       "2     4    0.4536   2.94,     73.56%\n",
-       "3     6    0.3480   3.78,     62.95%"
+       "0     1    1.2915   1.02,    101.63%\n",
+       "1     2    0.6742   1.96,     98.08%\n",
+       "2     4    0.3801   3.47,     86.66%\n",
+       "3     6    0.2733   4.80,     80.04%"
       ]
      },
-     "execution_count": 35,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "checkdir('task2')\n",
+    "checkdir('task5')\n",
     "!NP=1 make run.solution | grep speedup > scale.out\n",
     "!NP=2 make run.solution | grep speedup >> scale.out\n",
     "!NP=4 make run.solution | grep speedup >>  scale.out\n",
     "!NP=6 make run.solution | grep speedup >>  scale.out\n",
-    "data_frameS2 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
+    "data_frameS5 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
     "\n",
     "!rm scale.out\n",
     "\n",
-    "data_frameS2b=data_frameS2.iloc[:,[5,7,10,12]].copy()\n",
-    "data_frameS2b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})\n"
+    "data_frameS5b=data_frameS5.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frameS5b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
+    "The asynchronous execution and execution in the same stream can be seen in the profiler, e.g. as shown below.\n",
+    "\n",
+    "![Solution5.png](./resources/Solution5.png)\n",
+    "\n",
     "[Back to Top](#top)\n",
     "\n",
     "---"
@@ -1619,123 +2278,130 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "## Solution 3:<a name=\"solution3\"></a>\n",
-    "\n",
-    "\n",
-    "Update the boundaries first.\n",
-    "```C\n",
-    "#pragma acc parallel loop present(A,Anew)\n",
-    "for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "{\n",
-    "    A[(iy_start)*nx+ix] = Anew[(iy_start)*nx+ix];\n",
-    "    A[(iy_end-1)*nx+ix] = Anew[(iy_end-1)*nx+ix];\n",
-    "}\n",
-    "```\n",
-    "\n",
-    "Start the interior loop asynchronously so it can overlap with the MPI communication and wait at the end for the completion.\n",
-    "```C\n",
-    "#pragma acc parallel loop present(A,Anew) async\n",
-    "for (int iy = iy_start+1; iy < iy_end-1; iy++)\n",
-    "{\n",
-    "    for( int ix = ix_start; ix < ix_end; ix++ )\n",
-    "    {\n",
-    "        A[iy*nx+ix] = Anew[iy*nx+ix];\n",
-    "    }\n",
-    "}\n",
-    "\n",
-    "//Periodic boundary conditions\n",
-    "int top    = (rank == 0) ? (size-1) : rank-1;\n",
-    "int bottom = (rank == (size-1)) ? 0 : rank+1;\n",
-    "#pragma acc host_data use_device( A )\n",
-    "{\n",
-    "    double start_mpi = MPI_Wtime();\n",
-    "    //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom\n",
-    "    MPI_Sendrecv( A+iy_start*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  A+iy_end*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "\n",
-    "    //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top\n",
-    "    MPI_Sendrecv( A+(iy_end-1)*nx+ix_start,   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,\n",
-    "                  A+(iy_start-1)*nx+ix_start, (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,\n",
-    "                  MPI_COMM_WORLD, MPI_STATUS_IGNORE );\n",
-    "    mpi_time += MPI_Wtime() - start_mpi;\n",
-    "}\n",
-    "#pragma acc wait\n",
-    "```\n",
+    "## Solution 6:<a name=\"solution6\"></a> TODO\n",
     "\n",
     "\n",
     "\n",
     "#### Code\n",
     "\n",
-    "* [C Version](/edit/C/task3/poisson2d.solution.c)\n",
-    "* [Fortran Version](/edit/FORTRAN/task3/poisson2d.solution.F03)\n",
-    "\n",
-    "#### File browser\n",
-    "\n",
-    "Can be used to open source files, Makefiles, profiling output.\n",
-    "* [C Version](/tree/C/task3/)\n",
-    "* [Fortran Version](/tree/FORTRAN/task3/)"
+    "* [C Version](./C/task6/poisson2d.solution.c)\n",
+    "\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Compiling, Running and Profiling\n",
     "\n",
-    "You can compile, run and profile the solution with the next cells.  __After__ the profiling finished the output file `poisson2d.solution.pgprof`  can be downloaded from here: [C Version](/tree/C/task3/pgprof.poisson2d.Task3.solution.tar.gz?download=1) / [Fortran Version](/tree/FORTRAN/task3/pgprof.poisson2d.Task3.solution.tar.gz?download=1).    "
+    "You can compile, run and profile the solution with the next cells.  You can profile the code by executing the next cell. __After__ the profiling completed download the tarball containing the profiles (`pgprof.Task6.solution.poisson2d.tar.gz`) with the File Browser. \n",
+    "Then you can import them into pgprof / nvvp using the _Import_ option in the _File_ menu. Remember to use the _Multiple processes_ option in the assistant.   "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {},
+   "execution_count": 32,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/autofs/nccsopen-svm1_home/mathiasw/sc17task/C/task3p\n"
+      "/autofs/nccsopen-svm1_home/mathiasw/sc19-tutorial-openpower/4-GPU/HandsOn/Solution/C/task6\n"
      ]
     }
    ],
    "source": [
-    "%cd $basedir/task3"
+    "%cd $basedir/task6"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
+   "execution_count": 33,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "make: `poisson2d.solution' is up to date.\r\n"
+      "mpicxx -c -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned poisson2d_serial.c -o poisson2d_serial.o\n",
+      "poisson2d_serial(int, int, double, double *, double *, int, int, const double *):\n",
+      "     37, Generating present(Anew[:],rhs[:],Aref[:])\n",
+      "     39, Generating update device(rhs[:ny*nx],Aref[:ny*nx])\n",
+      "     40, Generating Tesla code\n",
+      "         43, #pragma acc loop gang /* blockIdx.x */\n",
+      "         44, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "         49, Generating implicit reduction(max:error)\n",
+      "     44, Loop is parallelizable\n",
+      "     51, Generating Tesla code\n",
+      "         54, #pragma acc loop gang /* blockIdx.x */\n",
+      "         55, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "     55, Loop is parallelizable\n",
+      "     58, Generating Tesla code\n",
+      "         62, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     65, Generating Tesla code\n",
+      "         67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "     77, Generating update self(Aref[:ny*nx])\n",
+      "mpicxx -DUSE_DOUBLE  -Minfo=accel -fast -acc -ta=tesla:cc70,pinned -I/ccsopen/home/mathiasw/nvshmem-master/build/include poisson2d.solution.c poisson2d_serial.o -o poisson2d.solution -L/ccsopen/home/mathiasw/nvshmem-master/build/lib -lnvshmem -Mcuda -lcuda -lrt \n",
+      "poisson2d.solution.c:\n",
+      "main:\n",
+      "     95, Generating enter data create(Aref[:ny*nx],rhs[:ny*nx],A[:ny*nx],Anew[:ny*nx])\n",
+      "    106, Generating present(Aref[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        110, #pragma acc loop gang /* blockIdx.x */\n",
+      "        111, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    111, Loop is parallelizable\n",
+      "    159, Generating update device(rhs[nx*iy_start:nx*(iy_end-iy_start)],A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])\n",
+      "    160, Generating present(A[:],rhs[:],Anew[:])\n",
+      "         Generating Tesla code\n",
+      "        165, #pragma acc loop gang /* blockIdx.x */\n",
+      "        166, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "        170, Generating implicit reduction(max:error)\n",
+      "    166, Loop is parallelizable\n",
+      "    176, Generating present(Anew[:],A[:])\n",
+      "         Generating Tesla code\n",
+      "        179, #pragma acc loop gang /* blockIdx.x */\n",
+      "        181, #pragma acc loop vector(128) /* threadIdx.x */\n",
+      "    181, Loop is parallelizable\n",
+      "    192, Generating present(A[:])\n",
+      "         Generating Tesla code\n",
+      "        195, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */\n",
+      "    205, Generating update self(A[nx*(iy_start-1):nx*((iy_end-iy_start)+2)])\n",
+      "    224, Generating exit data delete(rhs[:1],Aref[:1],A[:1],Anew[:1])\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task3')\n",
+    "checkdir('task6')\n",
     "!make poisson2d.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
+   "execution_count": 34,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
-      "Job <4709> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" ./poisson2d.solution\n",
+      "Job <25219> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
+      "WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation \n",
       "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
       "Calculate reference solution and time serial execution.\n",
       "    0, 0.250000\n",
@@ -1760,56 +2426,61 @@
       "  800, 0.249524\n",
       "  900, 0.249464\n",
       "Num GPUs: 2.\n",
-      "4096x4096: 1 GPU:   1.3417 s, 2 GPUs:   0.7025 s, speedup:     1.91, efficiency:    95.50%\n",
-      "MPI time:   0.0658 s, inter GPU BW:     1.86 GiB/s\n"
+      "4096x4096: 1 GPU:   1.3157 s, 2 GPUs:   0.6533 s, speedup:     2.01, efficiency:   100.70%\n",
+      "MPI time:   0.0000 s, inter GPU BW:      inf GiB/s\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task3')\n",
+    "checkdir('task6')\n",
     "!NP=2 make run.solution"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
+   "execution_count": 35,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bsub -W 60 -P GEN110 -nnodes 1 -Is jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --annotate-mpi openmpi -o /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
-      "Job <4710> is submitted to default queue <batch>.\n",
+      "bsub -W 60 -nnodes 1 -Is -P TRN003 jsrun -n 1 -c 1 -g ALL_GPUS -a 2 -c ALL_CPUS -d cyclic -b packed:7 --smpiargs \"-gpu\" pgprof -f --cpu-profiling off --openmp-profiling off --annotate-mpi openmpi -o /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.%q{OMPI_COMM_WORLD_RANK}.pgprof ./poisson2d.solution 10\n",
+      "Job <25220> is submitted to default queue <batch>.\n",
       "<<Waiting for dispatch ...>>\n",
       "<<Starting on login1>>\n",
-      "==34328== PGPROF is profiling process 34328, command: ./poisson2d.solution 10\n",
-      "==34327== PGPROF is profiling process 34327, command: ./poisson2d.solution 10\n",
-      "==34328== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.1.pgprof\n",
+      "==95445== PGPROF is profiling process 95445, command: ./poisson2d.solution 10\n",
+      "==95446== PGPROF is profiling process 95446, command: ./poisson2d.solution 10\n",
+      "==95445== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.1.pgprof\n",
+      "WARN: IB HCA and GPU are not connected to a PCIe switch so IB performance can be limited depending on the CPU generation \n",
       "Jacobi relaxation Calculation: 4096 x 4096 mesh\n",
       "Calculate reference solution and time serial execution.\n",
       "    0, 0.250000\n",
       "Parallel execution.\n",
       "    0, 0.250000\n",
       "Num GPUs: 2.\n",
-      "4096x4096: 1 GPU:   0.0234 s, 2 GPUs:   0.0135 s, speedup:     1.74, efficiency:    86.82%\n",
-      "MPI time:   0.0009 s, inter GPU BW:     1.29 GiB/s\n",
-      "==34327== Generated result file: /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.0.pgprof\n",
-      "mv /gpfs/wolf/gen110/scratch/mathiasw//poisson2d.solution.Task3.NP2.?.pgprof  .\n",
-      "tar -cvzf pgprof.poisson2d.Task3.solution.tar.gz poisson2d.solution.Task3.NP2.?.pgprof\n",
-      "poisson2d.solution.Task3.NP2.0.pgprof\n",
-      "poisson2d.solution.Task3.NP2.1.pgprof\n"
+      "4096x4096: 1 GPU:   0.0225 s, 2 GPUs:   0.0116 s, speedup:     1.94, efficiency:    96.85%\n",
+      "MPI time:   0.0000 s, inter GPU BW:      inf GiB/s\n",
+      "==95446== Generated result file: /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.0.pgprof\n",
+      "mv /gpfs/wolf/trn003/scratch/mathiasw//poisson2d.solution.Task6.NP2.?.pgprof  .\n",
+      "tar -cvzf pgprof.poisson2d.Task6.solution.tar.gz poisson2d.solution.Task6.NP2.?.pgprof\n",
+      "poisson2d.solution.Task6.NP2.0.pgprof\n",
+      "poisson2d.solution.Task6.NP2.1.pgprof\n"
      ]
     }
    ],
    "source": [
-    "checkdir('task3')\n",
+    "checkdir('task6')\n",
     "!NP=2 make profile.solution"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
     "#### Scaling\n",
     "\n",
@@ -1818,8 +2489,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
+   "execution_count": 36,
+   "metadata": {
+    "exercise": "solution"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -1866,30 +2539,30 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>1</td>\n",
-       "      <td>1.3711</td>\n",
-       "      <td>0.96,</td>\n",
-       "      <td>96.37%</td>\n",
+       "      <td>1.2869</td>\n",
+       "      <td>1.02,</td>\n",
+       "      <td>102.05%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>2</td>\n",
-       "      <td>0.7171</td>\n",
-       "      <td>1.86,</td>\n",
-       "      <td>92.90%</td>\n",
+       "      <td>0.6574</td>\n",
+       "      <td>1.99,</td>\n",
+       "      <td>99.26%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>4</td>\n",
-       "      <td>0.4104</td>\n",
-       "      <td>3.21,</td>\n",
-       "      <td>80.16%</td>\n",
+       "      <td>0.3670</td>\n",
+       "      <td>3.59,</td>\n",
+       "      <td>89.71%</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>6</td>\n",
-       "      <td>0.2966</td>\n",
-       "      <td>4.47,</td>\n",
-       "      <td>74.47%</td>\n",
+       "      <td>0.2450</td>\n",
+       "      <td>5.37,</td>\n",
+       "      <td>89.42%</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1897,46 +2570,43 @@
       ],
       "text/plain": [
        "   GPUs  time [s] speedup efficiency\n",
-       "0     1    1.3711   0.96,     96.37%\n",
-       "1     2    0.7171   1.86,     92.90%\n",
-       "2     4    0.4104   3.21,     80.16%\n",
-       "3     6    0.2966   4.47,     74.47%"
+       "0     1    1.2869   1.02,    102.05%\n",
+       "1     2    0.6574   1.99,     99.26%\n",
+       "2     4    0.3670   3.59,     89.71%\n",
+       "3     6    0.2450   5.37,     89.42%"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "checkdir('task3')\n",
+    "checkdir('task6')\n",
     "!NP=1 make run.solution | grep speedup > scale.out\n",
     "!NP=2 make run.solution | grep speedup >> scale.out\n",
     "!NP=4 make run.solution | grep speedup >>  scale.out\n",
     "!NP=6 make run.solution | grep speedup >>  scale.out\n",
-    "data_frameS3 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
+    "data_frameS5 = pandas.read_csv('scale.out', delim_whitespace=True, header=None)\n",
     "\n",
     "!rm scale.out\n",
     "\n",
-    "data_frameS3b=data_frameS3.iloc[:,[5,7,10,12]].copy()\n",
-    "data_frameS3b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
+    "data_frameS5b=data_frameS5.iloc[:,[5,7,10,12]].copy()\n",
+    "data_frameS5b.rename(columns={5:'GPUs', 7: 'time [s]', 10:'speedup', 12:'efficiency'})"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "exercise": "solution"
+   },
    "source": [
-    "The overlap of compute and communication can be seen in the profiler, e.g. as shown below.\n",
+    "The missing of device copies can be seen in the profiler, e.g. as shown below.\n",
     "\n",
-    "![Solution3.png](Solution3.png)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "![Solution6.png](./resources/Solution6.png)\n",
+    "\n",
+    "[Back to Top](#top)\n",
     "\n",
-    "---\n",
     "---"
    ]
   },
@@ -1944,21 +2614,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "---\n",
+    "\n",
     "# Survey<a name=\"survey\"></a>\n",
     "\n",
-    "Please rememeber to take some time and fill out the survey http://bit.ly/sc18-eval.\n",
-    "![eval.png](eval.png)"
+    "Please remember to take some time and fill out the surveyhttp://bit.ly/sc19-eval.\n",
+    "\n",
+    "![eval.png](./resources/eval.png)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
+  "celltoolbar": "Edit Metadata",
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
@@ -1974,7 +2641,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.7.0"
   },
   "toc": {
    "base_numbering": 1,
@@ -1991,5 +2658,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.pdf b/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.pdf
deleted file mode 100644
index e223c7e793ceb76a992a70c36fac1d1e6ea6f735..0000000000000000000000000000000000000000
Binary files a/4-GPU/HandsOn/Solution/HandsOnGPUProgramming_Solution.pdf and /dev/null differ
diff --git a/4-GPU/HandsOn/eval.png b/4-GPU/HandsOn/eval.png
deleted file mode 100644
index 4a4a1ea8bfcae18a4eba3650b0552d75dd7dc407..0000000000000000000000000000000000000000
Binary files a/4-GPU/HandsOn/eval.png and /dev/null differ