CUDA-NCCL TaskGraph
Commit c1953817, authored 1 year ago by Chelsea Maria John

update error

Parent: 12cf5e1f

1 changed file: README.md (83 additions, 26 deletions)
@@ -28,6 +28,8 @@ module purge
 module load Stages/2023
 module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC
 # NCCL Version 2.15.1
+# UCC Version=1.1.0
+# UCX Version 1.13.1
 # All variant have the following command line options
@@ -48,40 +50,39 @@ srun ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt
 ## Error
 ```
-Single GPU jacobi relaxation: 100 iterations on 20480 x 20480 mesh with norm check every 1 iterations
+Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations
 0, 35.776176
-[jwb0085:22560:0:22560] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x147768000000)
+[jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000)
-==== backtrace (tid:22560) ====
+==== backtrace (tid:2956) ====
-0 0x000000000004eb80 killpg() ???:0
+0 0x000000000004eb50 killpg() ???:0
 1 0x0000000000221af5 cuEGLApiInit() ???:0
 2 0x0000000000238d90 cuEGLApiInit() ???:0
 3 0x0000000000238efd cuEGLApiInit() ???:0
 4 0x000000000031deb5 cuMemMapArrayAsync() ???:0
-5 0x000000000001d115 ???() /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
+5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
 6 0x000000000005be34 cudaGraphAddKernelNode() ???:0
-7 0x0000000000405763 main() /p/project/cexalab/john2/task_graph/graph_wo_streams.cu:381
+7 0x00000000004058db main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:414
 8 0x000000000003ad85 __libc_start_main() ???:0
 9 0x000000000040360e _start() ???:0
 =================================
-[jwb0085:22560] *** Process received signal ***
+[jrc0438:02956] *** Process received signal ***
-[jwb0085:22560] Signal: Segmentation fault (11)
+[jrc0438:02956] Signal: Segmentation fault (11)
-[jwb0085:22560] Signal code: (-6)
+[jrc0438:02956] Signal code: (-6)
-[jwb0085:22560] Failing at address: 0x448100005820
+[jrc0438:02956] Failing at address: 0x448100000b8c
-[jwb0085:22560] [ 0] /usr/lib64/libc.so.6(+0x4eb80)[0x1479e2792b80]
+[jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50]
-[jwb0085:22560] [ 1] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x221af5)[0x1479da608af5]
+[jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5]
-[jwb0085:22560] [ 2] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x238d90)[0x1479da61fd90]
+[jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90]
-[jwb0085:22560] [ 3] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x238efd)[0x1479da61fefd]
+[jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd]
-[jwb0085:22560] [ 4] /p/software/juwelsbooster/stages/2023/software/nvidia-driver/default/lib/libcuda.so.1(+0x31deb5)[0x1479da704eb5]
+[jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5]
-[jwb0085:22560] [ 5] /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x1479e5582115]
+[jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115]
-[jwb0085:22560] [ 6] /p/software/juwelsbooster/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x1479e55c0e34]
+[jrc0438:02956] [ 6] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34]
-[jwb0085:22560] [ 7] ./jacobi[0x405763]
+[jrc0438:02956] [ 7] ./jacobi[0x4058db]
-[jwb0085:22560] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x1479e277ed85]
+[jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85]
-[jwb0085:22560] [ 9] ./jacobi[0x40360e]
+[jrc0438:02956] [ 9] ./jacobi[0x40360e]
-[jwb0085:22560] *** End of error message ***
+[jrc0438:02956] *** End of error message ***
-srun: error: jwb0085: task 0: Segmentation fault
 ```
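Both versions of the backtrace end inside `cudaGraphAddKernelNode()`, called from `main()` in `graph_wo_streams.cu` (line 381 in the earlier JUWELS Booster log, line 414 in the updated JURECA-DC log), so the process dies on the host while the task graph is still being built, before it is ever instantiated or launched. A common way to get a host-side segfault at exactly this call is a `cudaKernelNodeParams` structure that is not fully initialised, for example `kernelParams` entries that point at device memory or at stack variables that have gone out of scope. The sketch below shows the expected CUDA 11.7 call pattern; the kernel, its arguments and the grid/block sizes are placeholders and not taken from `graph_wo_streams.cu`.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Placeholder stand-in for the Jacobi stencil kernel; not the kernel from
// graph_wo_streams.cu.
__global__ void jacobi_kernel(float* a_new, const float* a, int nx, int ny) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ix > 0 && ix < nx - 1 && iy > 0 && iy < ny - 1) {
        a_new[iy * nx + ix] = 0.25f * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] +
                                       a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]);
    }
}

int main() {
    const int nx = 1024, ny = 1024;            // illustrative sizes only
    float *a, *a_new;
    cudaMalloc(&a, nx * ny * sizeof(float));
    cudaMalloc(&a_new, nx * ny * sizeof(float));

    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);

    // kernelParams must hold one valid host pointer per kernel argument at the
    // time the node is added; dangling or device-side addresses here are a
    // typical cause of a host-side crash in or around cudaGraphAddKernelNode.
    int nx_arg = nx, ny_arg = ny;
    void* kernel_args[] = { &a_new, &a, &nx_arg, &ny_arg };

    cudaKernelNodeParams params = {};          // zero-initialise every field
    params.func           = (void*)jacobi_kernel;
    params.gridDim        = dim3((nx + 31) / 32, (ny + 31) / 32, 1);
    params.blockDim       = dim3(32, 32, 1);
    params.sharedMemBytes = 0;
    params.kernelParams   = kernel_args;
    params.extra          = nullptr;

    cudaGraphNode_t node;
    cudaError_t err = cudaGraphAddKernelNode(&node, graph, nullptr, 0, &params);
    printf("cudaGraphAddKernelNode: %s\n", cudaGetErrorString(err));

    cudaGraphExec_t exec;
    cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
    cudaGraphLaunch(exec, 0);
    cudaDeviceSynchronize();

    cudaGraphExecDestroy(exec);
    cudaGraphDestroy(graph);
    cudaFree(a);
    cudaFree(a_new);
    return 0;
}
```

If the same pattern is already followed in `graph_wo_streams.cu`, the next things to check are the lifetime of whatever `kernelParams` points to and whether the graph handle is still valid at the reported line.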
-## Compute-sanitizer
+## Part of Compute-sanitizer log
 ```
 ========= COMPUTE-SANITIZER
@@ -111,10 +112,66 @@ srun: error: jwb0085: task 0: Segmentation fault
 ========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
 ========= Host Frame:MPI_Init [0x7af8e]
 ========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
-========= Host Frame:/p/project/cexalab/john2/task_graph/graph_wo_streams.cu:177:main [0x3782]
+========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782]
-========= in /p/project/cexalab/john2/task_graph/./jacobi
+========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
 ========= Host Frame:__libc_start_main [0x3ad85]
 ========= in /usr/lib64/libc.so.6
 ========= Host Frame:_start [0x360e]
-========= in /p/project/cexalab/john2/task_graph/./jacobi
+========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
+........
+========= Invalid __global__ write of size 8 bytes
+========= at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
+========= by thread (240,0,0) in block (0,0,0)
+========= Address 0x154f5fa00000 is out of bounds
+========= and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes
+========= Saved host backtrace up to driver entry point at kernel launch time
+========= Host Frame: [0x319c12]
+========= in /usr/lib64/libcuda.so.1
+========= Host Frame:__cudart808 [0xdea9b]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:cudaLaunchKernel [0x13a238]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
+========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
+========= Host Frame:__libc_start_main [0x3ad85]
+========= in /usr/lib64/libc.so.6
+========= Host Frame:_start [0x360e]
+========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
+=========
+========= Invalid __global__ write of size 16 bytes
+========= at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
+========= by thread (320,0,0) in block (1,0,0)
+========= Address 0x154f5d001000 is out of bounds
+========= and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes
+========= Saved host backtrace up to driver entry point at kernel launch time
+========= Host Frame: [0x319c12]
+========= in /usr/lib64/libcuda.so.1
+========= Host Frame:__cudart808 [0xdea9b]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:cudaLaunchKernel [0x13a238]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
+========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
+========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
+========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
+========= Host Frame:__libc_start_main [0x3ad85]
+========= in /usr/lib64/libc.so.6
+========= Host Frame:_start [0x360e]
+========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
+=========
 ```
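Both sanitizer reports show an invalid `__global__` write by `ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t`, launched from the `ncclGroupEnd()` at `graph_wo_streams.cu:310`, and in both cases the faulting address falls outside the nearest allocation of 6,291,456 bytes. Out-of-bounds writes inside NCCL's send/recv kernel usually mean that the count or offset passed to `ncclSend`/`ncclRecv` on some rank does not match the buffer it addresses. For reference, a minimal grouped send/recv halo exchange is sketched below; the slab layout, row count and the 4-GPUs-per-node device selection are illustrative assumptions, not code from this repository.

```cuda
#include <mpi.h>
#include <nccl.h>
#include <cuda_runtime.h>

// Minimal grouped ncclSend/ncclRecv halo exchange between periodic top/bottom
// neighbours. Buffer layout and row counts are illustrative only.
int main(int argc, char* argv[]) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    cudaSetDevice(rank % 4);                       // assumption: 4 GPUs per node

    // One NCCL communicator over all MPI ranks.
    ncclUniqueId id;
    if (rank == 0) ncclGetUniqueId(&id);
    MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
    ncclComm_t comm;
    ncclCommInitRank(&comm, size, id, rank);

    const int    nx         = 20480;               // points per row, as in the run above
    const int    local_rows = 128;                  // interior rows per rank (illustrative)
    const size_t row        = (size_t)nx;           // elements per row
    double* a;                                       // local slab plus one halo row above and below
    cudaMalloc(&a, (local_rows + 2) * row * sizeof(double));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    const int top    = (rank + size - 1) % size;    // periodic neighbours
    const int bottom = (rank + 1) % size;

    // Every count/offset below has to stay inside the cudaMalloc'd slab on both
    // the sending and the receiving rank; a mismatch surfaces as the
    // "Invalid __global__ write ... out of bounds" reports from
    // ncclKernel_SendRecv shown in the log above.
    ncclGroupStart();
    ncclSend(a + 1 * row,                nx, ncclDouble, top,    comm, stream);  // first interior row up
    ncclRecv(a + 0 * row,                nx, ncclDouble, top,    comm, stream);  // fill top halo
    ncclSend(a + local_rows * row,       nx, ncclDouble, bottom, comm, stream);  // last interior row down
    ncclRecv(a + (local_rows + 1) * row, nx, ncclDouble, bottom, comm, stream);  // fill bottom halo
    ncclGroupEnd();

    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(a);
    ncclCommDestroy(comm);
    MPI_Finalize();
    return 0;
}
```

When one of these counts or offsets overruns the destination buffer, compute-sanitizer attributes the write to the NCCL send/recv kernel rather than to the application kernel, which is the shape of the two reports above.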