diff --git a/README.md b/README.md index c0f06ce9e9f49d7e534b65a161ad99c558ee1b19..5bf5f55e19850c7f162c550d3aff3fc103823078 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,8 @@ module purge module load Stages/2023 module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC # NCCL Version 2.15.1 +# UCC Version 1.1.0 +# UCX Version 1.13.1 # All variant have the following command line options @@ -48,40 +50,39 @@ srun ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt ## Error ``` -Single GPU jacobi relaxation: 100 iterations on 20480 x 20480 mesh with norm check every 1 iterations +Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations 0, 35.776176 -[jwb0085:22560:0:22560] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x147768000000) -==== backtrace (tid: 22560) ==== - 0 0x000000000004eb80 killpg() ???:0 +[jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000) +==== backtrace (tid: 2956) ==== + 0 0x000000000004eb50 killpg() ???:0 1 0x0000000000221af5 cuEGLApiInit() ???:0 2 0x0000000000238d90 cuEGLApiInit() ???:0 3 0x0000000000238efd cuEGLApiInit() ???:0 4 0x000000000031deb5 cuMemMapArrayAsync() ???:0 - 5 0x000000000001d115 ???() /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0 + 5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0 6 0x000000000005be34 cudaGraphAddKernelNode() ???:0 - 7 0x0000000000405763 main() /p/project/cexalab/john2/task_graph/graph_wo_streams.cu:381 + 7 0x00000000004058db main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:414 8 0x000000000003ad85 __libc_start_main() ???:0 9 0x000000000040360e _start() ???:0 ================================= -[jwb0085:22560] *** Process received signal *** 
-[jwb0085:22560] Signal: Segmentation fault (11) -[jwb0085:22560] Signal code: (-6) -[jwb0085:22560] Failing at address: 0x448100005820 -[jwb0085:22560] [ 0] /usr/lib64/libc.so.6(+0x4eb80)[0x1479e2792b80] -[jwb0085:22560] [ 1] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x221af5)[0x1479da608af5] -[jwb0085:22560] [ 2] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x238d90)[0x1479da61fd90] -[jwb0085:22560] [ 3] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x238efd)[0x1479da61fefd] -[jwb0085:22560] [ 4] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x31deb5)[0x1479da704eb5] -[jwb0085:22560] [ 5] /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x1479e5582115] -[jwb0085:22560] [ 6] /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x1479e55c0e34] -[jwb0085:22560] [ 7] ./jacobi[0x405763] -[jwb0085:22560] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x1479e277ed85] -[jwb0085:22560] [ 9] ./jacobi[0x40360e] -[jwb0085:22560] *** End of error message *** -srun: error: jwb0085: task 0: Segmentation fault +[jrc0438:02956] *** Process received signal *** +[jrc0438:02956] Signal: Segmentation fault (11) +[jrc0438:02956] Signal code: (-6) +[jrc0438:02956] Failing at address: 0x448100000b8c +[jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50] +[jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5] +[jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90] +[jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd] +[jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5] +[jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115] +[jrc0438:02956] [ 6] 
/p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34] +[jrc0438:02956] [ 7] ./jacobi[0x4058db] +[jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85] +[jrc0438:02956] [ 9] ./jacobi[0x40360e] +[jrc0438:02956] *** End of error message *** ``` -## Compute-sanitizer +## Excerpt from the compute-sanitizer log ``` ========= COMPUTE-SANITIZER @@ -111,10 +112,66 @@ srun: error: jwb0085: task 0: Segmentation fault ========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 ========= Host Frame:MPI_Init [0x7af8e] ========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 -========= Host Frame:/p/project/cexalab/john2/task_graph/graph_wo_streams.cu:177:main [0x3782] -========= in /p/project/cexalab/john2/task_graph/./jacobi +========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782] +========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi ========= Host Frame:__libc_start_main [0x3ad85] ========= in /usr/lib64/libc.so.6 ========= Host Frame:_start [0x360e] -========= in /p/project/cexalab/john2/task_graph/./jacobi + +........ 
+========= Invalid __global__ write of size 8 bytes +========= at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *) +========= by thread (240,0,0) in block (0,0,0) +========= Address 0x154f5fa00000 is out of bounds +========= and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes +========= Saved host backtrace up to driver entry point at kernel launch time +========= Host Frame: [0x319c12] +========= in /usr/lib64/libcuda.so.1 +========= Host Frame:__cudart808 [0xdea9b] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:cudaLaunchKernel [0x13a238] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3] +========= in 
/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi +========= Host Frame:__libc_start_main [0x3ad85] +========= in /usr/lib64/libc.so.6 +========= Host Frame:_start [0x360e] +========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi +========= +========= Invalid __global__ write of size 16 bytes +========= at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *) +========= by thread (320,0,0) in block (1,0,0) +========= Address 0x154f5d001000 is out of bounds +========= and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes +========= Saved host backtrace up to driver entry point at kernel launch time +========= Host Frame: [0x319c12] +========= in /usr/lib64/libcuda.so.1 +========= Host Frame:__cudart808 [0xdea9b] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:cudaLaunchKernel [0x13a238] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host 
Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179] +========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 +========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3] +========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi +========= Host Frame:__libc_start_main [0x3ad85] +========= in /usr/lib64/libc.so.6 +========= Host Frame:_start [0x360e] +========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi +========= ```