Skip to content
Snippets Groups Projects
Commit c1953817 authored by Chelsea Maria John's avatar Chelsea Maria John
Browse files

update error

parent 12cf5e1f
Branches
No related tags found
No related merge requests found
...@@ -28,6 +28,8 @@ module purge ...@@ -28,6 +28,8 @@ module purge
module load Stages/2023 module load Stages/2023
module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC
# NCCL Version 2.15.1 # NCCL Version 2.15.1
# UCC Version 1.1.0
# UCX Version 1.13.1
# All variants have the following command line options # All variants have the following command line options
...@@ -48,40 +50,39 @@ srun ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt ...@@ -48,40 +50,39 @@ srun ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt
## Error ## Error
``` ```
Single GPU jacobi relaxation: 100 iterations on 20480 x 20480 mesh with norm check every 1 iterations Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations
0, 35.776176 0, 35.776176
[jwb0085:22560:0:22560] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x147768000000) [jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000)
==== backtrace (tid: 22560) ==== ==== backtrace (tid: 2956) ====
0 0x000000000004eb80 killpg() ???:0 0 0x000000000004eb50 killpg() ???:0
1 0x0000000000221af5 cuEGLApiInit() ???:0 1 0x0000000000221af5 cuEGLApiInit() ???:0
2 0x0000000000238d90 cuEGLApiInit() ???:0 2 0x0000000000238d90 cuEGLApiInit() ???:0
3 0x0000000000238efd cuEGLApiInit() ???:0 3 0x0000000000238efd cuEGLApiInit() ???:0
4 0x000000000031deb5 cuMemMapArrayAsync() ???:0 4 0x000000000031deb5 cuMemMapArrayAsync() ???:0
5 0x000000000001d115 ???() /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0 5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
6 0x000000000005be34 cudaGraphAddKernelNode() ???:0 6 0x000000000005be34 cudaGraphAddKernelNode() ???:0
7 0x0000000000405763 main() /p/project/cexalab/john2/task_graph/graph_wo_streams.cu:381 7 0x00000000004058db main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:414
8 0x000000000003ad85 __libc_start_main() ???:0 8 0x000000000003ad85 __libc_start_main() ???:0
9 0x000000000040360e _start() ???:0 9 0x000000000040360e _start() ???:0
================================= =================================
[jwb0085:22560] *** Process received signal *** [jrc0438:02956] *** Process received signal ***
[jwb0085:22560] Signal: Segmentation fault (11) [jrc0438:02956] Signal: Segmentation fault (11)
[jwb0085:22560] Signal code: (-6) [jrc0438:02956] Signal code: (-6)
[jwb0085:22560] Failing at address: 0x448100005820 [jrc0438:02956] Failing at address: 0x448100000b8c
[jwb0085:22560] [ 0] /usr/lib64/libc.so.6(+0x4eb80)[0x1479e2792b80] [jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50]
[jwb0085:22560] [ 1] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x221af5)[0x1479da608af5] [jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5]
[jwb0085:22560] [ 2] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x238d90)[0x1479da61fd90] [jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90]
[jwb0085:22560] [ 3] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x238efd)[0x1479da61fefd] [jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd]
[jwb0085:22560] [ 4] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x31deb5)[0x1479da704eb5] [jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5]
[jwb0085:22560] [ 5] /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x1479e5582115] [jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115]
[jwb0085:22560] [ 6] /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x1479e55c0e34] [jrc0438:02956] [ 6] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34]
[jwb0085:22560] [ 7] ./jacobi[0x405763] [jrc0438:02956] [ 7] ./jacobi[0x4058db]
[jwb0085:22560] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x1479e277ed85] [jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85]
[jwb0085:22560] [ 9] ./jacobi[0x40360e] [jrc0438:02956] [ 9] ./jacobi[0x40360e]
[jwb0085:22560] *** End of error message *** [jrc0438:02956] *** End of error message ***
srun: error: jwb0085: task 0: Segmentation fault
``` ```
## Compute-sanitizer ## Part of Compute-sanitizer log
``` ```
========= COMPUTE-SANITIZER ========= COMPUTE-SANITIZER
...@@ -111,10 +112,66 @@ srun: error: jwb0085: task 0: Segmentation fault ...@@ -111,10 +112,66 @@ srun: error: jwb0085: task 0: Segmentation fault
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 ========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x7af8e] ========= Host Frame:MPI_Init [0x7af8e]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 ========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:/p/project/cexalab/john2/task_graph/graph_wo_streams.cu:177:main [0x3782] ========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782]
========= in /p/project/cexalab/john2/task_graph/./jacobi ========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85] ========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6 ========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e] ========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/./jacobi ========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
........
========= Invalid __global__ write of size 8 bytes
========= at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
========= by thread (240,0,0) in block (0,0,0)
========= Address 0x154f5fa00000 is out of bounds
========= and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x319c12]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:__cudart808 [0xdea9b]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:cudaLaunchKernel [0x13a238]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
=========
========= Invalid __global__ write of size 16 bytes
========= at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
========= by thread (320,0,0) in block (1,0,0)
========= Address 0x154f5d001000 is out of bounds
========= and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x319c12]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:__cudart808 [0xdea9b]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:cudaLaunchKernel [0x13a238]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
=========
``` ```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment