Skip to content
Snippets Groups Projects
Commit d6e73441 authored by Chelsea Maria John's avatar Chelsea Maria John
Browse files

add valgrind

parent c1953817
Branches main
No related tags found
No related merge requests found
...@@ -26,10 +26,12 @@ Modify `NCCL`,`MPICXX` and `CUDA` paths in `Makefile` for running on different s ...@@ -26,10 +26,12 @@ Modify `NCCL`,`MPICXX` and `CUDA` paths in `Makefile` for running on different s
module purge module purge
module load Stages/2023 module load Stages/2023
module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 MPI-settings/CUDA-UCC
# NCCL Version 2.15.1 module load Nsight-Systems/2023.2.1 Valgrind/3.19.0
# UCC Version=1.1.0 # UCC Version=1.1.0
# UCX Version 1.13.1 # UCX Version 1.13.1
# NCCL Version 2.15.1
# Valgrind Version 3.19.0
# All variant have the following command line options # All variant have the following command line options
...@@ -43,17 +45,17 @@ module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Sys ...@@ -43,17 +45,17 @@ module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Sys
NXNY="20480" NXNY="20480"
export CUDA_VISIBLE_DEVICES=0,1,2,3 export CUDA_VISIBLE_DEVICES=0,1,2,3
srun ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt srun valgrind --tool=memcheck --verbose --time-stamp=yes \
--log-file=valgrind.err --leak-check=summary --track-origins=yes \
./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY}
``` ```
## Error ## Error
``` ```
Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations [jrc0437:20114:0:20114] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x1b2000000)
0, 35.776176 ==== backtrace (tid: 20112) ====
[jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000)
==== backtrace (tid: 2956) ====
0 0x000000000004eb50 killpg() ???:0 0 0x000000000004eb50 killpg() ???:0
1 0x0000000000221af5 cuEGLApiInit() ???:0 1 0x0000000000221af5 cuEGLApiInit() ???:0
2 0x0000000000238d90 cuEGLApiInit() ???:0 2 0x0000000000238d90 cuEGLApiInit() ???:0
...@@ -61,117 +63,25 @@ Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm chec ...@@ -61,117 +63,25 @@ Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm chec
4 0x000000000031deb5 cuMemMapArrayAsync() ???:0 4 0x000000000031deb5 cuMemMapArrayAsync() ???:0
5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0 5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
6 0x000000000005be34 cudaGraphAddKernelNode() ???:0 6 0x000000000005be34 cudaGraphAddKernelNode() ???:0
7 0x00000000004058db main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:414 7 0x0000000000405947 main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:423
8 0x000000000003ad85 __libc_start_main() ???:0 8 0x000000000003ad85 __libc_start_main() ???:0
9 0x000000000040360e _start() ???:0 9 0x000000000040360e _start() ???:0
================================= =================================
[jrc0438:02956] *** Process received signal *** [jrc0437:20112] *** Process received signal ***
[jrc0438:02956] Signal: Segmentation fault (11) [jrc0437:20112] Signal: Segmentation fault (11)
[jrc0438:02956] Signal code: (-6) [jrc0437:20112] Signal code: (-6)
[jrc0438:02956] Failing at address: 0x448100000b8c [jrc0437:20112] Failing at address: 0x448100004e90
[jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50] [jrc0437:20112] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50]
[jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5] [jrc0437:20112] [ 1] /usr/lib64/libc.so.6(gsignal+0xed)[0x9671aad]
[jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90] [jrc0437:20112] [ 2] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50]
[jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd] [jrc0437:20112] [ 3] /usr/lib64/libcuda.so.1(+0x221af5)[0xc223af5]
[jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5] [jrc0437:20112] [ 4] /usr/lib64/libcuda.so.1(+0x238d90)[0xc23ad90]
[jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115] [jrc0437:20112] [ 5] /usr/lib64/libcuda.so.1(+0x238efd)[0xc23aefd]
[jrc0438:02956] [ 6] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34] [jrc0437:20112] [ 6] /usr/lib64/libcuda.so.1(+0x31deb5)[0xc31feb5]
[jrc0438:02956] [ 7] ./jacobi[0x4058db] [jrc0437:20112] [ 7] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x4a4e115]
[jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85] [jrc0437:20112] [ 8] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x4a8ce34]
[jrc0438:02956] [ 9] ./jacobi[0x40360e] [jrc0437:20112] [ 9] ./jacobi[0x405947]
[jrc0438:02956] *** End of error message *** [jrc0437:20112] [10] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x965dd85]
``` [jrc0437:20112] [11] ./jacobi[0x40360e]
[jrc0437:20112] *** End of error message ***
## Part of Compute-sanitizer log
```
========= COMPUTE-SANITIZER
========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2b8c71]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:base/cuda_iface.c:22:uct_cuda_base_query_devices_common [0x6c45]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/ucx/libuct_cuda.so.0
========= Host Frame:base/uct_md.c:115:uct_md_query_tl_resources [0x134f6]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libuct.so.0
========= Host Frame:core/ucp_context.c:1332:ucp_add_component_resources [0x205dc]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
========= Host Frame:core/ucp_context.c:1470:ucp_fill_resources [0x2144f]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
========= Host Frame:core/ucp_context.c:1886:ucp_init_version [0x2295f]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
========= Host Frame:mca_pml_ucx_open [0x7402]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_base_framework_components_open [0x50614]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40
========= Host Frame:mca_pml_base_open [0xcce4f]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:mca_base_framework_open [0x59a24]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40
========= Host Frame:ompi_mpi_init [0xdcb84]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x7af8e]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
........
========= Invalid __global__ write of size 8 bytes
========= at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
========= by thread (240,0,0) in block (0,0,0)
========= Address 0x154f5fa00000 is out of bounds
========= and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x319c12]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:__cudart808 [0xdea9b]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:cudaLaunchKernel [0x13a238]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
=========
========= Invalid __global__ write of size 16 bytes
========= at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
========= by thread (320,0,0) in block (1,0,0)
========= Address 0x154f5d001000 is out of bounds
========= and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x319c12]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:__cudart808 [0xdea9b]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:cudaLaunchKernel [0x13a238]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
=========
``` ```
...@@ -404,18 +404,32 @@ int main(int argc, char* argv[]) { ...@@ -404,18 +404,32 @@ int main(int argc, char* argv[]) {
nodeDependencies.push_back(jacobiMemsetNode); //manage dependency vector nodeDependencies.push_back(jacobiMemsetNode); //manage dependency vector
cudaDeviceSynchronize(); cudaDeviceSynchronize();
void *kernelArgsNode1[7] = {(void *)a_new, (void *)a, (void *)l2_norm_d, (void *)&new_iy_start, (void *)&new_iy_start_neigh , (void *)&new_nx, (void *)&new_calculate_norm}; void *kernelArgsNode1[7] = {
(void *)a_new, (void *)a,
(void *)l2_norm_d, (void *)&new_iy_start,
(void *)&new_iy_start_neigh, (void *)&new_nx,
(void *)&new_calculate_norm,
};
jacobiKernelNode1Params.func = (void *)jacobi_kernel_proxy; jacobiKernelNode1Params.func = (void *)jacobi_kernel_proxy;
jacobiKernelNode1Params.blockDim = dim3(dim_block_x, dim_block_y, 1); jacobiKernelNode1Params.blockDim = dim3(dim_block_x, dim_block_y, 1);
jacobiKernelNode1Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); jacobiKernelNode1Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
jacobiKernelNode1Params.sharedMemBytes = 0; jacobiKernelNode1Params.sharedMemBytes = 0;
jacobiKernelNode1Params.kernelParams = kernelArgsNode1; jacobiKernelNode1Params.kernelParams = kernelArgsNode1;
jacobiKernelNode1Params.extra = NULL; jacobiKernelNode1Params.extra = NULL;
for (std::size_t i = 0; i < 8; i++)
{
fprintf(stderr,"Kernel Arg %d: %p\n", i, kernelArgsNode1[i]);
}
CUDA_RT_CALL(cudaGraphAddKernelNode(&jacobiKernelNode1, graphs[calculate_norm][is_even], nodeDependencies.data(), CUDA_RT_CALL(cudaGraphAddKernelNode(&jacobiKernelNode1, graphs[calculate_norm][is_even], nodeDependencies.data(),
nodeDependencies.size(),&jacobiKernelNode1Params)); nodeDependencies.size(),&jacobiKernelNode1Params));
void *kernelArgsNode2[7] = {(void *)a_new, (void *)a, (void *)l2_norm_d, (void *)&new_iy_end_neigh, (void *)&new_iy_end, (void *)&new_nx, (void *)&new_calculate_norm}; void *kernelArgsNode2[7] = {
(void *)a_new, (void *)a,
(void *)l2_norm_d, (void *)&new_iy_end_neigh,
(void *)&new_iy_end, (void *)&new_nx,
(void *)&new_calculate_norm
};
jacobiKernelNode2Params.func = (void *)jacobi_kernel_proxy; jacobiKernelNode2Params.func = (void *)jacobi_kernel_proxy;
jacobiKernelNode2Params.blockDim = dim3(dim_block_x, dim_block_y, 1); jacobiKernelNode2Params.blockDim = dim3(dim_block_x, dim_block_y, 1);
jacobiKernelNode2Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); jacobiKernelNode2Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
...@@ -471,8 +485,12 @@ int main(int argc, char* argv[]) { ...@@ -471,8 +485,12 @@ int main(int argc, char* argv[]) {
nodeDependencies.push_back(jacobiMemsetNode); nodeDependencies.push_back(jacobiMemsetNode);
nodeDependencies.push_back(ncclNode); nodeDependencies.push_back(ncclNode);
void *kernelArgsNode3[7] = {(void *)&a_new, (void *)&a,(void *)&l2_norm_d, (void *)&new_iy_start_neigh,(void *)&new_iy_end_neigh, void *kernelArgsNode3[7] = {
(void *)&new_nx,(void *)&new_calculate_norm}; (void *)&a_new, (void *)&a,
(void *)&l2_norm_d, (void *)&new_iy_start_neigh,
(void *)&new_iy_end_neigh,(void *)&new_nx,
(void *)&new_calculate_norm
};
jacobiKernelNode3Params.func = (void *)jacobi_kernel_proxy; jacobiKernelNode3Params.func = (void *)jacobi_kernel_proxy;
jacobiKernelNode3Params.blockDim = dim3(dim_block_x, dim_block_y, 1); jacobiKernelNode3Params.blockDim = dim3(dim_block_x, dim_block_y, 1);
jacobiKernelNode3Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); jacobiKernelNode3Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment