Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
CUDA-NCCL TaskGraph
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Chelsea Maria John
CUDA-NCCL TaskGraph
Commits
d6e73441
Commit
d6e73441
authored
1 year ago
by
Chelsea Maria John
Browse files
Options
Downloads
Patches
Plain Diff
add valgrind
parent
c1953817
Branches
main
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
README.md
+27
-117
27 additions, 117 deletions
README.md
graph_wo_streams.cu
+22
-4
22 additions, 4 deletions
graph_wo_streams.cu
with
49 additions
and
121 deletions
README.md
+
27
−
117
View file @
d6e73441
...
@@ -26,10 +26,12 @@ Modify `NCCL`,`MPICXX` and `CUDA` paths in `Makefile` for running on different s
...
@@ -26,10 +26,12 @@ Modify `NCCL`,`MPICXX` and `CUDA` paths in `Makefile` for running on different s
module purge
module purge
module load Stages/2023
module load Stages/2023
module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7
Nsight-Systems/2023.2.1
MPI-settings/CUDA-UCC
module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 MPI-settings/CUDA-UCC
# NCCL Version 2.15.1
module load Nsight-Systems/2023.2.1 Valgrind/3.19.0
# UCC Version=1.1.0
# UCC Version=1.1.0
# UCX Version 1.13.1
# UCX Version 1.13.1
# NCCL Version 2.15.1
# Valgrind Version 3.19.0
# All variants have the following command-line options
# All variants have the following command-line options
...
@@ -43,17 +45,17 @@ module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Sys
...
@@ -43,17 +45,17 @@ module load GCC/11.3.0 CUDA/11.7 OpenMPI/4.1.4 NCCL/default-CUDA-11.7 Nsight-Sys
NXNY
=
"20480"
NXNY
=
"20480"
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3
srun ./jacobi
-niter
10
-nx
${
NXNY
}
-ny
${
NXNY
}
2>&1 |
tee
-a
debug_log.txt
srun valgrind
--tool
=
memcheck
--verbose
--time-stamp
=
yes
\
--log-file
=
valgrind.err
--leak-check
=
summary
--track-origins
=
yes
\
./jacobi
-niter
10
-nx
${
NXNY
}
-ny
${
NXNY
}
```
```
## Error
## Error
```
```
Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm check every 1 iterations
[jrc0437:20114:0:20114] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x1b2000000)
0, 35.776176
==== backtrace (tid: 20112) ====
[jrc0438:2954 :0:2954] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x14a63a000000)
==== backtrace (tid: 2956) ====
0 0x000000000004eb50 killpg() ???:0
0 0x000000000004eb50 killpg() ???:0
1 0x0000000000221af5 cuEGLApiInit() ???:0
1 0x0000000000221af5 cuEGLApiInit() ???:0
2 0x0000000000238d90 cuEGLApiInit() ???:0
2 0x0000000000238d90 cuEGLApiInit() ???:0
...
@@ -61,117 +63,25 @@ Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm chec
...
@@ -61,117 +63,25 @@ Single GPU jacobi relaxation: 10 iterations on 20480 x 20480 mesh with norm chec
4 0x000000000031deb5 cuMemMapArrayAsync() ???:0
4 0x000000000031deb5 cuMemMapArrayAsync() ???:0
5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
5 0x000000000001d115 ???() /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0:0
6 0x000000000005be34 cudaGraphAddKernelNode() ???:0
6 0x000000000005be34 cudaGraphAddKernelNode() ???:0
7 0x0000000000405
8db
main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:4
14
7 0x0000000000405
947
main() /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:4
23
8 0x000000000003ad85 __libc_start_main() ???:0
8 0x000000000003ad85 __libc_start_main() ???:0
9 0x000000000040360e _start() ???:0
9 0x000000000040360e _start() ???:0
=================================
=================================
[jrc0438:02956] *** Process received signal ***
[jrc0437:20112] *** Process received signal ***
[jrc0438:02956] Signal: Segmentation fault (11)
[jrc0437:20112] Signal: Segmentation fault (11)
[jrc0438:02956] Signal code: (-6)
[jrc0437:20112] Signal code: (-6)
[jrc0438:02956] Failing at address: 0x448100000b8c
[jrc0437:20112] Failing at address: 0x448100004e90
[jrc0438:02956] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x14d3bb145b50]
[jrc0437:20112] [ 0] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50]
[jrc0438:02956] [ 1] /usr/lib64/libcuda.so.1(+0x221af5)[0x14d3b2608af5]
[jrc0437:20112] [ 1] /usr/lib64/libc.so.6(gsignal+0xed)[0x9671aad]
[jrc0438:02956] [ 2] /usr/lib64/libcuda.so.1(+0x238d90)[0x14d3b261fd90]
[jrc0437:20112] [ 2] /usr/lib64/libc.so.6(+0x4eb50)[0x9671b50]
[jrc0438:02956] [ 3] /usr/lib64/libcuda.so.1(+0x238efd)[0x14d3b261fefd]
[jrc0437:20112] [ 3] /usr/lib64/libcuda.so.1(+0x221af5)[0xc223af5]
[jrc0438:02956] [ 4] /usr/lib64/libcuda.so.1(+0x31deb5)[0x14d3b2704eb5]
[jrc0437:20112] [ 4] /usr/lib64/libcuda.so.1(+0x238d90)[0xc23ad90]
[jrc0438:02956] [ 5] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x14d3bfe26115]
[jrc0437:20112] [ 5] /usr/lib64/libcuda.so.1(+0x238efd)[0xc23aefd]
[jrc0438:02956] [ 6] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x14d3bfe64e34]
[jrc0437:20112] [ 6] /usr/lib64/libcuda.so.1(+0x31deb5)[0xc31feb5]
[jrc0438:02956] [ 7] ./jacobi[0x4058db]
[jrc0437:20112] [ 7] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(+0x1d115)[0x4a4e115]
[jrc0438:02956] [ 8] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x14d3bb131d85]
[jrc0437:20112] [ 8] /p/software/jurecadc/stages/2023/software/CUDA/11.7/lib/libcudart.so.11.0(cudaGraphAddKernelNode+0x204)[0x4a8ce34]
[jrc0438:02956] [ 9] ./jacobi[0x40360e]
[jrc0437:20112] [ 9] ./jacobi[0x405947]
[jrc0438:02956] *** End of error message ***
[jrc0437:20112] [10] /usr/lib64/libc.so.6(__libc_start_main+0xe5)[0x965dd85]
```
[jrc0437:20112] [11] ./jacobi[0x40360e]
[jrc0437:20112] *** End of error message ***
## Part of Compute-sanitizer log
```
========= COMPUTE-SANITIZER
========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxGetDevice.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2b8c71]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:base/cuda_iface.c:22:uct_cuda_base_query_devices_common [0x6c45]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/ucx/libuct_cuda.so.0
========= Host Frame:base/uct_md.c:115:uct_md_query_tl_resources [0x134f6]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libuct.so.0
========= Host Frame:core/ucp_context.c:1332:ucp_add_component_resources [0x205dc]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
========= Host Frame:core/ucp_context.c:1470:ucp_fill_resources [0x2144f]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
========= Host Frame:core/ucp_context.c:1886:ucp_init_version [0x2295f]
========= in /p/software/jurecadc/stages/2023/software/UCX/default-GCCcore-11.3.0/lib/libucp.so.0
========= Host Frame:mca_pml_ucx_open [0x7402]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/openmpi/mca_pml_ucx.so
========= Host Frame:mca_base_framework_components_open [0x50614]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40
========= Host Frame:mca_pml_base_open [0xcce4f]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:mca_base_framework_open [0x59a24]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libopen-pal.so.40
========= Host Frame:ompi_mpi_init [0xdcb84]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:MPI_Init [0x7af8e]
========= in /p/software/jurecadc/stages/2023/software/OpenMPI/4.1.4-GCC-11.3.0/lib/libmpi.so.40
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:177:main [0x3782]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
........
========= Invalid __global__ write of size 8 bytes
========= at 0x8e10 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
========= by thread (240,0,0) in block (0,0,0)
========= Address 0x154f5fa00000 is out of bounds
========= and is 50,331,648 bytes before the nearest allocation at 0x154f62a00000 of size 6,291,456 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x319c12]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:__cudart808 [0xdea9b]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:cudaLaunchKernel [0x13a238]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
=========
========= Invalid __global__ write of size 16 bytes
========= at 0x70f0 in ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_t(ncclDevComm *, unsigned long, ncclWork *)
========= by thread (320,0,0) in block (1,0,0)
========= Address 0x154f5d001000 is out of bounds
========= and is 6,295,553 bytes after the nearest allocation at 0x154f5c400000 of size 6,291,456 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x319c12]
========= in /usr/lib64/libcuda.so.1
========= Host Frame:__cudart808 [0xdea9b]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:cudaLaunchKernel [0x13a238]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/enqueue.cc:1068:ncclLaunchKernel(ncclComm*, ncclKernelPlan*) [0x5f27d]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:340:groupLaunch(ncclAsyncJob*) [0x63f8f]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:376:ncclGroupEndInternal() [0x64ae8]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/dev/shm/swmanage/jurecadc/NCCL/default/GCCcore-11.3.0-CUDA-11.7/nccl/src/group.cc:106:ncclGroupEnd [0x65179]
========= in /p/software/jurecadc/stages/2023/software/NCCL/default-GCCcore-11.3.0-CUDA-11.7/lib/libnccl.so.2
========= Host Frame:/p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/graph_wo_streams.cu:310:main [0x50f3]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
========= Host Frame:__libc_start_main [0x3ad85]
========= in /usr/lib64/libc.so.6
========= Host Frame:_start [0x360e]
========= in /p/project/cexalab/john2/task_graph/cuda-nccl-taskgraph/./jacobi
=========
```
```
This diff is collapsed.
Click to expand it.
graph_wo_streams.cu
+
22
−
4
View file @
d6e73441
...
@@ -404,18 +404,32 @@ int main(int argc, char* argv[]) {
...
@@ -404,18 +404,32 @@ int main(int argc, char* argv[]) {
nodeDependencies
.
push_back
(
jacobiMemsetNode
);
//manage dependency vector
nodeDependencies
.
push_back
(
jacobiMemsetNode
);
//manage dependency vector
cudaDeviceSynchronize
();
cudaDeviceSynchronize
();
void
*
kernelArgsNode1
[
7
]
=
{(
void
*
)
a_new
,
(
void
*
)
a
,
(
void
*
)
l2_norm_d
,
(
void
*
)
&
new_iy_start
,
(
void
*
)
&
new_iy_start_neigh
,
(
void
*
)
&
new_nx
,
(
void
*
)
&
new_calculate_norm
};
void
*
kernelArgsNode1
[
7
]
=
{
(
void
*
)
a_new
,
(
void
*
)
a
,
(
void
*
)
l2_norm_d
,
(
void
*
)
&
new_iy_start
,
(
void
*
)
&
new_iy_start_neigh
,
(
void
*
)
&
new_nx
,
(
void
*
)
&
new_calculate_norm
,
};
jacobiKernelNode1Params
.
func
=
(
void
*
)
jacobi_kernel_proxy
;
jacobiKernelNode1Params
.
func
=
(
void
*
)
jacobi_kernel_proxy
;
jacobiKernelNode1Params
.
blockDim
=
dim3
(
dim_block_x
,
dim_block_y
,
1
);
jacobiKernelNode1Params
.
blockDim
=
dim3
(
dim_block_x
,
dim_block_y
,
1
);
jacobiKernelNode1Params
.
gridDim
=
dim3
((
nx
+
dim_block_x
-
1
)
/
dim_block_x
,((
iy_end
-
iy_start
)
+
dim_block_y
-
1
)
/
dim_block_y
,
1
);
jacobiKernelNode1Params
.
gridDim
=
dim3
((
nx
+
dim_block_x
-
1
)
/
dim_block_x
,((
iy_end
-
iy_start
)
+
dim_block_y
-
1
)
/
dim_block_y
,
1
);
jacobiKernelNode1Params
.
sharedMemBytes
=
0
;
jacobiKernelNode1Params
.
sharedMemBytes
=
0
;
jacobiKernelNode1Params
.
kernelParams
=
kernelArgsNode1
;
jacobiKernelNode1Params
.
kernelParams
=
kernelArgsNode1
;
jacobiKernelNode1Params
.
extra
=
NULL
;
jacobiKernelNode1Params
.
extra
=
NULL
;
for
(
std
::
size_t
i
=
0
;
i
<
8
;
i
++
)
{
fprintf
(
stderr
,
"Kernel Arg %d: %p
\n
"
,
i
,
kernelArgsNode1
[
i
]);
}
CUDA_RT_CALL
(
cudaGraphAddKernelNode
(
&
jacobiKernelNode1
,
graphs
[
calculate_norm
][
is_even
],
nodeDependencies
.
data
(),
CUDA_RT_CALL
(
cudaGraphAddKernelNode
(
&
jacobiKernelNode1
,
graphs
[
calculate_norm
][
is_even
],
nodeDependencies
.
data
(),
nodeDependencies
.
size
(),
&
jacobiKernelNode1Params
));
nodeDependencies
.
size
(),
&
jacobiKernelNode1Params
));
void
*
kernelArgsNode2
[
7
]
=
{(
void
*
)
a_new
,
(
void
*
)
a
,
(
void
*
)
l2_norm_d
,
(
void
*
)
&
new_iy_end_neigh
,
(
void
*
)
&
new_iy_end
,
(
void
*
)
&
new_nx
,
(
void
*
)
&
new_calculate_norm
};
void
*
kernelArgsNode2
[
7
]
=
{
(
void
*
)
a_new
,
(
void
*
)
a
,
(
void
*
)
l2_norm_d
,
(
void
*
)
&
new_iy_end_neigh
,
(
void
*
)
&
new_iy_end
,
(
void
*
)
&
new_nx
,
(
void
*
)
&
new_calculate_norm
};
jacobiKernelNode2Params
.
func
=
(
void
*
)
jacobi_kernel_proxy
;
jacobiKernelNode2Params
.
func
=
(
void
*
)
jacobi_kernel_proxy
;
jacobiKernelNode2Params
.
blockDim
=
dim3
(
dim_block_x
,
dim_block_y
,
1
);
jacobiKernelNode2Params
.
blockDim
=
dim3
(
dim_block_x
,
dim_block_y
,
1
);
jacobiKernelNode2Params
.
gridDim
=
dim3
((
nx
+
dim_block_x
-
1
)
/
dim_block_x
,((
iy_end
-
iy_start
)
+
dim_block_y
-
1
)
/
dim_block_y
,
1
);
jacobiKernelNode2Params
.
gridDim
=
dim3
((
nx
+
dim_block_x
-
1
)
/
dim_block_x
,((
iy_end
-
iy_start
)
+
dim_block_y
-
1
)
/
dim_block_y
,
1
);
...
@@ -471,8 +485,12 @@ int main(int argc, char* argv[]) {
...
@@ -471,8 +485,12 @@ int main(int argc, char* argv[]) {
nodeDependencies
.
push_back
(
jacobiMemsetNode
);
nodeDependencies
.
push_back
(
jacobiMemsetNode
);
nodeDependencies
.
push_back
(
ncclNode
);
nodeDependencies
.
push_back
(
ncclNode
);
void
*
kernelArgsNode3
[
7
]
=
{(
void
*
)
&
a_new
,
(
void
*
)
&
a
,(
void
*
)
&
l2_norm_d
,
(
void
*
)
&
new_iy_start_neigh
,(
void
*
)
&
new_iy_end_neigh
,
void
*
kernelArgsNode3
[
7
]
=
{
(
void
*
)
&
new_nx
,(
void
*
)
&
new_calculate_norm
};
(
void
*
)
&
a_new
,
(
void
*
)
&
a
,
(
void
*
)
&
l2_norm_d
,
(
void
*
)
&
new_iy_start_neigh
,
(
void
*
)
&
new_iy_end_neigh
,(
void
*
)
&
new_nx
,
(
void
*
)
&
new_calculate_norm
};
jacobiKernelNode3Params
.
func
=
(
void
*
)
jacobi_kernel_proxy
;
jacobiKernelNode3Params
.
func
=
(
void
*
)
jacobi_kernel_proxy
;
jacobiKernelNode3Params
.
blockDim
=
dim3
(
dim_block_x
,
dim_block_y
,
1
);
jacobiKernelNode3Params
.
blockDim
=
dim3
(
dim_block_x
,
dim_block_y
,
1
);
jacobiKernelNode3Params
.
gridDim
=
dim3
((
nx
+
dim_block_x
-
1
)
/
dim_block_x
,((
iy_end
-
iy_start
)
+
dim_block_y
-
1
)
/
dim_block_y
,
1
);
jacobiKernelNode3Params
.
gridDim
=
dim3
((
nx
+
dim_block_x
-
1
)
/
dim_block_x
,((
iy_end
-
iy_start
)
+
dim_block_y
-
1
)
/
dim_block_y
,
1
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment