diff --git a/README.md b/README.md
index 5bf5f55e19850c7f162c550d3aff3fc103823078..879e66f9c987907e89df32eb565e16861ae0f4e4 100644
--- a/README.md
+++ b/README.md
@@ -26,10 +26,12 @@ Modify `NCCL`,`MPICXX` and `CUDA` paths in `Makefile` for running on different s
 
 module purge 
 module load Stages/2023 
-module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC
-# NCCL Version 2.15.1
+module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 MPI-settings/CUDA-UCC 
+module load Nsight-Systems/2023.2.1 Valgrind/3.19.0
 # UCC Version=1.1.0 
 # UCX Version 1.13.1
+# NCCL Version 2.15.1
+# Valgrind Version 3.19.0
 
 
 # All variant have the following command line options
@@ -43,17 +45,17 @@ module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Sys
 NXNY="20480"
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 
-srun  ./jacobi  -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt
+srun  valgrind --tool=memcheck --verbose --time-stamp=yes \
+    --log-file=valgrind.err  --leak-check=summary --track-origins=yes \
+    ./jacobi  -niter 10 -nx ${NXNY} -ny ${NXNY} 
 
 ```
 
 ## Error
 
 ```
-Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations
-    0, 35.776176
-[jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000)
-==== backtrace (tid:   2956) ====
+[jrc0437:20114:0:20114] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x1b2000000)
+==== backtrace (tid:  20112) ====
  0 0x000000000004eb50 killpg()  ???:0
  1 0x0000000000221af5 cuEGLApiInit()  ???:0
  2 0x0000000000238d90 cuEGLApiInit()  ???:0
@@ -61,117 +63,25 @@ Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm chec
  4 0x000000000031deb5 cuMemMapArrayAsync()  ???:0
  5 0x000000000001d115 ???()  /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
  6 0x000000000005be34 cudaGraphAddKernelNode()  ???:0
- 7 0x00000000004058db main()  /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:414
+ 7 0x0000000000405947 main()  /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:423
  8 0x000000000003ad85 __libc_start_main()  ???:0
  9 0x000000000040360e _start()  ???:0
 =================================
-[jrc0438:02956] *** Process received signal ***
-[jrc0438:02956] Signal: Segmentation fault (11)
-[jrc0438:02956] Signal code:  (-6)
-[jrc0438:02956] Failing at address: 0x448100000b8c
-[jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50]
-[jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5]
-[jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90]
-[jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd]
-[jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5]
-[jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115]
-[jrc0438:02956] [ 6] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34]
-[jrc0438:02956] [ 7] ./jacobi[0x4058db]
-[jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85]
-[jrc0438:02956] [ 9] ./jacobi[0x40360e]
-[jrc0438:02956] *** End of error message ***
-```
-
-## Part of Compute-sanitizer log
-
-```
-========= COMPUTE-SANITIZER
-========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxGetDevice.
-=========     Saved host backtrace up to driver entry point at error
-=========     Host Frame: [0x2b8c71]
-=========                in /usr/lib64/libcuda.so.1
-=========     Host Frame:base/cuda_iface.c:22:uct_cuda_base_query_devices_common [0x6c45]
-=========                in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/ucx/libuct_cuda.so.0
-=========     Host Frame:base/uct_md.c:115:uct_md_query_tl_resources [0x134f6]
-=========                in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libuct.so.0
-=========     Host Frame:core/ucp_context.c:1332:ucp_add_component_resources [0x205dc]
-=========                in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
-=========     Host Frame:core/ucp_context.c:1470:ucp_fill_resources [0x2144f]
-=========                in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
-=========     Host Frame:core/ucp_context.c:1886:ucp_init_version [0x2295f]
-=========                in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
-=========     Host Frame:mca_pml_ucx_open [0x7402]
-=========                in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/openmpi/mca_pml_ucx.so
-=========     Host Frame:mca_base_framework_components_open [0x50614]
-=========                in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40
-=========     Host Frame:mca_pml_base_open [0xcce4f]
-=========                in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
-=========     Host Frame:mca_base_framework_open [0x59a24]
-=========                in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40
-=========     Host Frame:ompi_mpi_init [0xdcb84]
-=========                in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
-=========     Host Frame:MPI_Init [0x7af8e]
-=========                in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
-=========     Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782]
-=========                in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
-=========     Host Frame:__libc_start_main [0x3ad85]
-=========                in /usr/lib64/libc.so.6
-=========     Host Frame:_start [0x360e]
-=========                in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
-
-........
-========= Invalid __global__ write of size 8 bytes
-=========     at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
-=========     by thread (240,0,0) in block (0,0,0)
-=========     Address 0x154f5fa00000 is out of bounds
-=========     and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes
-=========     Saved host backtrace up to driver entry point at kernel launch time
-=========     Host Frame: [0x319c12]
-=========                in /usr/lib64/libcuda.so.1
-=========     Host Frame:__cudart808 [0xdea9b]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:cudaLaunchKernel [0x13a238]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
-=========                in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
-=========     Host Frame:__libc_start_main [0x3ad85]
-=========                in /usr/lib64/libc.so.6
-=========     Host Frame:_start [0x360e]
-=========                in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
-========= 
-========= Invalid __global__ write of size 16 bytes
-=========     at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
-=========     by thread (320,0,0) in block (1,0,0)
-=========     Address 0x154f5d001000 is out of bounds
-=========     and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes
-=========     Saved host backtrace up to driver entry point at kernel launch time
-=========     Host Frame: [0x319c12]
-=========                in /usr/lib64/libcuda.so.1
-=========     Host Frame:__cudart808 [0xdea9b]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:cudaLaunchKernel [0x13a238]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
-=========                in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
-=========     Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
-=========                in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
-=========     Host Frame:__libc_start_main [0x3ad85]
-=========                in /usr/lib64/libc.so.6
-=========     Host Frame:_start [0x360e]
-=========                in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
-========= 
+[jrc0437:20112] *** Process received signal ***
+[jrc0437:20112] Signal: Segmentation fault (11)
+[jrc0437:20112] Signal code:  (-6)
+[jrc0437:20112] Failing at address: 0x448100004e90
+[jrc0437:20112] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50]
+[jrc0437:20112] [ 1] /usr/lib64/libc.so.6(gsignal+0xed)[0x9671aad]
+[jrc0437:20112] [ 2] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50]
+[jrc0437:20112] [ 3] /usr/lib64/libcuda.so.1(+0x221af5)[0xc223af5]
+[jrc0437:20112] [ 4] /usr/lib64/libcuda.so.1(+0x238d90)[0xc23ad90]
+[jrc0437:20112] [ 5] /usr/lib64/libcuda.so.1(+0x238efd)[0xc23aefd]
+[jrc0437:20112] [ 6] /usr/lib64/libcuda.so.1(+0x31deb5)[0xc31feb5]
+[jrc0437:20112] [ 7] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x4a4e115]
+[jrc0437:20112] [ 8] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x4a8ce34]
+[jrc0437:20112] [ 9] ./jacobi[0x405947]
+[jrc0437:20112] [10] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x965dd85]
+[jrc0437:20112] [11] ./jacobi[0x40360e]
+[jrc0437:20112] *** End of error message ***
 ```
diff --git a/graph_wo_streams.cu b/graph_wo_streams.cu
index d8f62615444305e87d3d6ff5713b54eda5d7fb08..26025967a2b68fac517604c1407f03e8dab74c46 100644
--- a/graph_wo_streams.cu
+++ b/graph_wo_streams.cu
@@ -404,18 +404,32 @@ int main(int argc, char* argv[]) {
             nodeDependencies.push_back(jacobiMemsetNode); //manage dependency vector
             cudaDeviceSynchronize();
 
-            void *kernelArgsNode1[7] = {(void *)a_new, (void *)a, (void *)l2_norm_d, (void *)&new_iy_start, (void *)&new_iy_start_neigh , (void *)&new_nx, (void *)&new_calculate_norm};
+            void *kernelArgsNode1[7] = {
+                                        (void *)a_new, (void *)a, 
+                                        (void *)l2_norm_d, (void *)&new_iy_start, 
+                                        (void *)&new_iy_start_neigh, (void *)&new_nx, 
+                                        (void *)&new_calculate_norm, 
+                                        };
             jacobiKernelNode1Params.func = (void *)jacobi_kernel_proxy;
             jacobiKernelNode1Params.blockDim = dim3(dim_block_x, dim_block_y, 1);
             jacobiKernelNode1Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
             jacobiKernelNode1Params.sharedMemBytes = 0;
             jacobiKernelNode1Params.kernelParams = kernelArgsNode1;
             jacobiKernelNode1Params.extra = NULL;
+            for (std::size_t i = 0; i < 8; i++)
+            {
+                fprintf(stderr,"Kernel Arg %d: %p\n", i, kernelArgsNode1[i]);
+            }
             CUDA_RT_CALL(cudaGraphAddKernelNode(&jacobiKernelNode1, graphs[calculate_norm][is_even], nodeDependencies.data(), 
                                     nodeDependencies.size(),&jacobiKernelNode1Params));
      
 
-            void *kernelArgsNode2[7] = {(void *)a_new, (void *)a, (void *)l2_norm_d, (void *)&new_iy_end_neigh, (void *)&new_iy_end, (void *)&new_nx, (void *)&new_calculate_norm};
+            void *kernelArgsNode2[7] = {
+                                        (void *)a_new, (void *)a,
+                                        (void *)l2_norm_d, (void *)&new_iy_end_neigh,
+                                        (void *)&new_iy_end, (void *)&new_nx,
+                                        (void *)&new_calculate_norm
+                                        };
             jacobiKernelNode2Params.func = (void *)jacobi_kernel_proxy;
             jacobiKernelNode2Params.blockDim = dim3(dim_block_x, dim_block_y, 1);
             jacobiKernelNode2Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
@@ -471,8 +485,12 @@ int main(int argc, char* argv[]) {
             nodeDependencies.push_back(jacobiMemsetNode);
             nodeDependencies.push_back(ncclNode);
 
-            void *kernelArgsNode3[7] = {(void *)&a_new, (void *)&a,(void *)&l2_norm_d, (void *)&new_iy_start_neigh,(void *)&new_iy_end_neigh,
-                                         (void *)&new_nx,(void *)&new_calculate_norm};
+            void *kernelArgsNode3[7] = {
+                                        (void *)&a_new, (void *)&a,
+                                        (void *)&l2_norm_d, (void *)&new_iy_start_neigh,
+                                        (void *)&new_iy_end_neigh,(void *)&new_nx,
+                                        (void *)&new_calculate_norm
+                                        };
             jacobiKernelNode3Params.func = (void *)jacobi_kernel_proxy;
             jacobiKernelNode3Params.blockDim = dim3(dim_block_x, dim_block_y, 1);
             jacobiKernelNode3Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);