diff --git a/README.md b/README.md index 5bf5f55e19850c7f162c550d3aff3fc103823078..879e66f9c987907e89df32eb565e16861ae0f4e4 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,12 @@ Modify `NCCL`,`MPICXX` and `CUDA` paths in `Makefile` for running on different s module purge module load Stages/2023 -module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Systems/2023.2.1 MPI-settings/CUDA-UCC -# NCCL Version 2.15.1 +module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 MPI-settings/CUDA-UCC +module load Nsight-Systems/2023.2.1 Valgrind/3.19.0 # UCC Version=1.1.0 # UCX Version 1.13.1 +# NCCL Version 2.15.1 +# Valgrind Version 3.19.0 # All variant have the following command line options @@ -43,17 +45,17 @@ module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Sys NXNY="20480" export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} 2>&1 | tee -a debug_log.txt +srun valgrind --tool=memcheck --verbose --time-stamp=yes \ + --log-file=valgrind.err --leak-check=summary --track-origins=yes \ + ./jacobi -niter 10 -nx ${NXNY} -ny ${NXNY} ``` ## Error ``` -Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations - 0, 35.776176 -[jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000) -==== backtrace (tid: 2956) ==== +[jrc0437:20114:0:20114] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x1b2000000) +==== backtrace (tid: 20112) ==== 0 0x000000000004eb50 killpg() ???:0 1 0x0000000000221af5 cuEGLApiInit() ???:0 2 0x0000000000238d90 cuEGLApiInit() ???:0 @@ -61,117 +63,25 @@ Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm chec 4 0x000000000031deb5 cuMemMapArrayAsync() ???:0 5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0 6 0x000000000005be34 cudaGraphAddKernelNode() ???:0 - 7 0x00000000004058db main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:414 + 7 0x0000000000405947 main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:423 8 0x000000000003ad85 __libc_start_main() ???:0 9 0x000000000040360e _start() ???:0 ================================= -[jrc0438:02956] *** Process received signal *** -[jrc0438:02956] Signal: Segmentation fault (11) -[jrc0438:02956] Signal code: (-6) -[jrc0438:02956] Failing at address: 0x448100000b8c -[jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50] -[jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5] -[jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90] -[jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd] -[jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5] -[jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115] -[jrc0438:02956] [ 6] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34] -[jrc0438:02956] [ 7] ./jacobi[0x4058db] -[jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85] -[jrc0438:02956] [ 9] ./jacobi[0x40360e] -[jrc0438:02956] *** End of error message *** -``` - -## Part of Compute-sanitizer log - -``` -========= COMPUTE-SANITIZER -========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxGetDevice. -========= Saved host backtrace up to driver entry point at error -========= Host Frame: [0x2b8c71] -========= in /usr/lib64/libcuda.so.1 -========= Host Frame:base/cuda_iface.c:22:uct_cuda_base_query_devices_common [0x6c45] -========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/ucx/libuct_cuda.so.0 -========= Host Frame:base/uct_md.c:115:uct_md_query_tl_resources [0x134f6] -========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libuct.so.0 -========= Host Frame:core/ucp_context.c:1332:ucp_add_component_resources [0x205dc] -========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0 -========= Host Frame:core/ucp_context.c:1470:ucp_fill_resources [0x2144f] -========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0 -========= Host Frame:core/ucp_context.c:1886:ucp_init_version [0x2295f] -========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0 -========= Host Frame:mca_pml_ucx_open [0x7402] -========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/openmpi/mca_pml_ucx.so -========= Host Frame:mca_base_framework_components_open [0x50614] -========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40 -========= Host Frame:mca_pml_base_open [0xcce4f] -========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 -========= Host Frame:mca_base_framework_open [0x59a24] -========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40 -========= Host Frame:ompi_mpi_init [0xdcb84] -========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 -========= Host Frame:MPI_Init [0x7af8e] -========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40 -========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782] -========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi -========= Host Frame:__libc_start_main [0x3ad85] -========= in /usr/lib64/libc.so.6 -========= Host Frame:_start [0x360e] -========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi - -........ -========= Invalid __global__ write of size 8 bytes -========= at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *) -========= by thread (240,0,0) in block (0,0,0) -========= Address 0x154f5fa00000 is out of bounds -========= and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes -========= Saved host backtrace up to driver entry point at kernel launch time -========= Host Frame: [0x319c12] -========= in /usr/lib64/libcuda.so.1 -========= Host Frame:__cudart808 [0xdea9b] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:cudaLaunchKernel [0x13a238] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3] -========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi -========= Host Frame:__libc_start_main [0x3ad85] -========= in /usr/lib64/libc.so.6 -========= Host Frame:_start [0x360e] -========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi -========= -========= Invalid __global__ write of size 16 bytes -========= at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *) -========= by thread (320,0,0) in block (1,0,0) -========= Address 0x154f5d001000 is out of bounds -========= and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes -========= Saved host backtrace up to driver entry point at kernel launch time -========= Host Frame: [0x319c12] -========= in /usr/lib64/libcuda.so.1 -========= Host Frame:__cudart808 [0xdea9b] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:cudaLaunchKernel [0x13a238] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179] -========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2 -========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3] -========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi -========= Host Frame:__libc_start_main [0x3ad85] -========= in /usr/lib64/libc.so.6 -========= Host Frame:_start [0x360e] -========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi -========= +[jrc0437:20112] *** Process received signal *** +[jrc0437:20112] Signal: Segmentation fault (11) +[jrc0437:20112] Signal code: (-6) +[jrc0437:20112] Failing at address: 0x448100004e90 +[jrc0437:20112] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50] +[jrc0437:20112] [ 1] /usr/lib64/libc.so.6(gsignal+0xed)[0x9671aad] +[jrc0437:20112] [ 2] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50] +[jrc0437:20112] [ 3] /usr/lib64/libcuda.so.1(+0x221af5)[0xc223af5] +[jrc0437:20112] [ 4] /usr/lib64/libcuda.so.1(+0x238d90)[0xc23ad90] +[jrc0437:20112] [ 5] /usr/lib64/libcuda.so.1(+0x238efd)[0xc23aefd] +[jrc0437:20112] [ 6] /usr/lib64/libcuda.so.1(+0x31deb5)[0xc31feb5] +[jrc0437:20112] [ 7] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x4a4e115] +[jrc0437:20112] [ 8] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x4a8ce34] +[jrc0437:20112] [ 9] ./jacobi[0x405947] +[jrc0437:20112] [10] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x965dd85] +[jrc0437:20112] [11] ./jacobi[0x40360e] +[jrc0437:20112] *** End of error message *** ``` diff --git a/graph_wo_streams.cu b/graph_wo_streams.cu index d8f62615444305e87d3d6ff5713b54eda5d7fb08..26025967a2b68fac517604c1407f03e8dab74c46 100644 --- a/graph_wo_streams.cu +++ b/graph_wo_streams.cu @@ -404,18 +404,32 @@ int main(int argc, char* argv[]) { nodeDependencies.push_back(jacobiMemsetNode); //manage dependency vector cudaDeviceSynchronize(); - void *kernelArgsNode1[7] = {(void *)a_new, (void *)a, (void *)l2_norm_d, (void *)&new_iy_start, (void *)&new_iy_start_neigh , (void *)&new_nx, (void *)&new_calculate_norm}; + void *kernelArgsNode1[7] = { + (void *)a_new, (void *)a, + (void *)l2_norm_d, (void *)&new_iy_start, + (void *)&new_iy_start_neigh, (void *)&new_nx, + (void *)&new_calculate_norm, + }; jacobiKernelNode1Params.func = (void *)jacobi_kernel_proxy; jacobiKernelNode1Params.blockDim = dim3(dim_block_x, dim_block_y, 1); jacobiKernelNode1Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); jacobiKernelNode1Params.sharedMemBytes = 0; jacobiKernelNode1Params.kernelParams = kernelArgsNode1; jacobiKernelNode1Params.extra = NULL; + for (std::size_t i = 0; i < 8; i++) + { + fprintf(stderr,"Kernel Arg %d: %p\n", i, kernelArgsNode1[i]); + } CUDA_RT_CALL(cudaGraphAddKernelNode(&jacobiKernelNode1, graphs[calculate_norm][is_even], nodeDependencies.data(), nodeDependencies.size(),&jacobiKernelNode1Params)); - void *kernelArgsNode2[7] = {(void *)a_new, (void *)a, (void *)l2_norm_d, (void *)&new_iy_end_neigh, (void *)&new_iy_end, (void *)&new_nx, (void *)&new_calculate_norm}; + void *kernelArgsNode2[7] = { + (void *)a_new, (void *)a, + (void *)l2_norm_d, (void *)&new_iy_end_neigh, + (void *)&new_iy_end, (void *)&new_nx, + (void *)&new_calculate_norm + }; jacobiKernelNode2Params.func = (void *)jacobi_kernel_proxy; jacobiKernelNode2Params.blockDim = dim3(dim_block_x, dim_block_y, 1); jacobiKernelNode2Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); @@ -471,8 +485,12 @@ int main(int argc, char* argv[]) { nodeDependencies.push_back(jacobiMemsetNode); nodeDependencies.push_back(ncclNode); - void *kernelArgsNode3[7] = {(void *)&a_new, (void *)&a,(void *)&l2_norm_d, (void *)&new_iy_start_neigh,(void *)&new_iy_end_neigh, - (void *)&new_nx,(void *)&new_calculate_norm}; + void *kernelArgsNode3[7] = { + (void *)&a_new, (void *)&a, + (void *)&l2_norm_d, (void *)&new_iy_start_neigh, + (void *)&new_iy_end_neigh,(void *)&new_nx, + (void *)&new_calculate_norm + }; jacobiKernelNode3Params.func = (void *)jacobi_kernel_proxy; jacobiKernelNode3Params.blockDim = dim3(dim_block_x, dim_block_y, 1); jacobiKernelNode3Params.gridDim = dim3((nx + dim_block_x - 1) / dim_block_x,((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1);