From 7b500ac12082127709553bdd12c85a965646d144 Mon Sep 17 00:00:00 2001
From: Mathias Wagner <mathiasw@nvidia.com>
Date: Sat, 16 Nov 2019 09:31:29 -0700
Subject: [PATCH] update for solution 6

---
 .../HandsOnGPUProgramming_master.ipynb        | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/4-GPU/HandsOn/.master/HandsOnGPUProgramming_master.ipynb b/4-GPU/HandsOn/.master/HandsOnGPUProgramming_master.ipynb
index 5d09371..4e5bfe9 100644
--- a/4-GPU/HandsOn/.master/HandsOnGPUProgramming_master.ipynb
+++ b/4-GPU/HandsOn/.master/HandsOnGPUProgramming_master.ipynb
@@ -4150,6 +4150,7 @@
     "#include <nvshmem.h>\n",
     "#include <nvshmemx.h>\n",
     "```\n",
+    "\n",
     "and initialize NVSHMEM with MPI\n",
     "```C\n",
     "MPI_Comm mpi_comm = MPI_COMM_WORLD;\n",
@@ -4850,6 +4851,34 @@
     "## Solution 6:<a name=\"solution6\"></a>\n",
     "\n",
     "\n",
+    "The most important part here is to get an `nvshmem_ptr` pointing to the symmetric `d_A` allocation of your top and bottom neighbor.\n",
+    "```C\n",
+    "real * restrict d_Atop = (real *)nvshmem_ptr(d_A, top);\n",
+    "real * restrict d_Abottom = (real *)nvshmem_ptr(d_A, bottom);\n",
+    "```\n",
+    "\n",
+    "When updating `A` from Anew make sure to also update `A` on your top and bottom neighbor if you are at the boundary:\n",
+    "```C\n",
+    "#pragma acc parallel loop present(A, Anew) deviceptr(d_Atop, d_Abottom) async\n",
+    "for (int iy = iy_start; iy < iy_end; iy++) {\n",
+    "    for (int ix = ix_start; ix < ix_end; ix++) {\n",
+    "        A[iy * nx + ix] = Anew[iy * nx + ix];\n",
+    "        if(iy == iy_start){// this also needs to go to the lower halo region of my upper neighbor\n",
+    "            d_Atop[iy_end_top * nx + ix] = Anew[iy * nx + ix];\n",
+    "        }\n",
+    "        if(iy == iy_end -1){// this also needs to go to the upper halo region of my bottom neighbor\n",
+    "            d_Abottom[(iy_start_bottom - 1) * nx + ix]  =  Anew[iy * nx + ix];\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "We can then remove the explicit `nvhsmem_put` calls on completely. But remember to still keep the barrier.\n",
+    "```C\n",
+    "nvshmemx_barrier_all_on_stream((cudaStream_t)acc_get_cuda_stream(acc_get_default_async()));\n",
+    "````\n",
+    "\n",
+    "\n",
     "\n",
     "#### Code\n",
     "\n",
@@ -5148,7 +5177,7 @@
     "exercise": "solution"
    },
    "source": [
-    "The missing of device copies can be seen in the profiler, e.g. as shown below.\n",
+    "The missing of device copies can be seen in the profiler, e.g. as shown below. There are only kernels running mostly back-to-back, only interrupted by the global reduction.\n",
     "\n",
     "![Solution6.png](./resources/Solution6.png)\n",
     "\n",
-- 
GitLab