Skip to content
Snippets Groups Projects
Commit fb17d31b authored by mathias wagner's avatar mathias wagner
Browse files

master files should be good now

parent 04ebbf3a
Branches
No related tags found
No related merge requests found
Showing
with 1553 additions and 542 deletions
File added
File added
"Device","Kernel","Invocations","Metric Name","Metric Description","Min","Max","Avg"
"Tesla V100-SXM2-16GB (0)","main_98_gpu",3,"gld_efficiency","Global Memory Load Efficiency",90.885714%,91.040204%,90.938132%
"Tesla V100-SXM2-16GB (0)","main_98_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.956522%,88.956522%,88.956522%
"Tesla V100-SXM2-16GB (0)","main_106_gpu",3,"gld_efficiency","Global Memory Load Efficiency",94.722222%,94.722222%,94.722222%
"Tesla V100-SXM2-16GB (0)","main_106_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.956522%,88.956522%,88.956522%
"Tesla V100-SXM2-16GB (0)","main_94_gpu__red",3,"gld_efficiency","Global Memory Load Efficiency",99.756335%,99.756335%,99.756335%
"Tesla V100-SXM2-16GB (0)","main_94_gpu__red",3,"gst_efficiency","Global Memory Store Efficiency",25.000000%,25.000000%,25.000000%
"Tesla V100-SXM2-16GB (0)","main_66_gpu",1,"gld_efficiency","Global Memory Load Efficiency",0.000000%,0.000000%,0.000000%
"Tesla V100-SXM2-16GB (0)","main_66_gpu",1,"gst_efficiency","Global Memory Store Efficiency",100.000000%,100.000000%,100.000000%
"Tesla V100-SXM2-16GB (0)","main_88_gpu",3,"gld_efficiency","Global Memory Load Efficiency",91.836772%,91.874827%,91.856345%
"Tesla V100-SXM2-16GB (0)","main_88_gpu",3,"gst_efficiency","Global Memory Store Efficiency",88.845486%,88.845486%,88.845486%
"Tesla V100-SXM2-16GB (0)","main_111_gpu",3,"gld_efficiency","Global Memory Load Efficiency",25.000000%,25.000000%,25.000000%
"Tesla V100-SXM2-16GB (0)","main_111_gpu",3,"gst_efficiency","Global Memory Store Efficiency",25.000000%,25.000000%,25.000000%
File added
File added
File added
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <math.h>
#include <mpi.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
// TODO: Include necessary headers for NVSHMEM
// Registers an existing device allocation as the device copy of a host
// allocation, so OpenACC treats the pair as already present on the device
// (needed when the device buffer comes from NVSHMEM rather than OpenACC).
void map(real* restrict host_arr, real* restrict device_arr, int num_bytes) {
    // num_bytes is forwarded unchanged to acc_map_data (which takes a byte count).
    acc_map_data(host_arr, device_arr, num_bytes);
}
/*
 * Multi-GPU Jacobi relaxation for the 2D Poisson equation (MPI + OpenACC).
 * The domain is decomposed in rows across MPI ranks; halo rows are exchanged
 * with MPI_Sendrecv (to be replaced with NVSHMEM puts per the TODOs).
 * Returns 0 on success, -1 if the parallel result diverges from the serial
 * reference beyond tol.
 */
int main(int argc, char** argv) {
    // Problem size and solver controls; iter_max may be overridden by argv[1].
    int ny = 4096;
    int nx = 4096;
    int iter_max = 1000;
    const real tol = 1.0e-5;
    if (argc == 2) {
        iter_max = atoi(argv[1]);
    }

    int rank = 0;
    int size = 1;
    // Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    // TODO: Initialize NVSHMEM with MPI using nvshmemx_init_attr

    // Bind each rank to its own accelerator (one device per rank).
#pragma acc set device_num(rank)

    real* restrict const A = (real*)malloc(nx * ny * sizeof(real));
    real* restrict const Aref = (real*)malloc(nx * ny * sizeof(real));
    real* restrict const Anew = (real*)malloc(nx * ny * sizeof(real));
    real* restrict const rhs = (real*)malloc(nx * ny * sizeof(real));
    // TODO: Allocate symmetric device memory for A
    // real *d_A = ...
    // TODO: For OpenACC we need to map it to A and Anew so that OpenACC knows we already allocated
    // device memory for A and Anew
    // You can use the helper function map(...) above or use acc_map_data directly

    // Set the right-hand side on the interior of the grid.
    for (int iy = 1; iy < ny - 1; iy++) {
        for (int ix = 1; ix < nx - 1; ix++) {
            const real x = -1.0 + (2.0 * ix / (nx - 1));
            const real y = -1.0 + (2.0 * iy / (ny - 1));
            rhs[iy * nx + ix] = expr(-10.0 * (x * x + y * y));
        }
    }

#pragma acc enter data create(A [0:nx * ny], Aref [0:nx * ny], Anew [0:nx * ny], rhs [0:nx * ny])

    int ix_start = 1;
    int ix_end = (nx - 1);

    // Ensure correctness if ny%size != 0
    int chunk_size = ceil((1.0 * ny) / size);
    int iy_start = rank * chunk_size;
    int iy_end = iy_start + chunk_size;
    // Do not process boundaries
    iy_start = max(iy_start, 1);
    iy_end = min(iy_end, ny - 1);

    // OpenACC Warm-up
#pragma acc parallel loop present(A, Aref)
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            Aref[iy * nx + ix] = 0.0;
            A[iy * nx + ix] = 0.0;
        }
    }

    // TODO: Warming up MPI / CUDA IPC is not needed with NVSHMEM - remove that part
    // MPI Warm-up to establish CUDA IPC connections
    for (int i = 0; i < 2; ++i) {
        int top = (rank == 0) ? (size - 1) : rank - 1;
        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
#pragma acc host_data use_device(A)
        {
            // 1. Send row iy_start (first modified row) to top, receive lower boundary (iy_end)
            // from bottom
            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // 2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary
            // (iy_start-1) from top
            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
    }
    // Wait for all processes to finish Warm-up
    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", ny, nx);

    double runtime_serial = 0.0;
    if (rank == 0) {
        printf("Calculate reference solution and time serial execution.\n");
        // Timing of MPI rank 0 is used to calculate speedup do this in isolation
        double start = MPI_Wtime();
        poisson2d_serial(rank, iter_max, tol, Aref, Anew, nx, ny, rhs);
        runtime_serial = MPI_Wtime() - start;
    }
    // All ranks need the reference solution for the final correctness check.
    MPI_Bcast(Aref, nx * ny, MPI_REAL_TYPE, 0, MPI_COMM_WORLD);

    // Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0) printf("Parallel execution.\n");

    double mpi_time = 0.0;
    double start = MPI_Wtime();
    int iter = 0;
    real error = 1.0;

    // Upload this rank's chunk plus one halo row on each side.
#pragma acc update device(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx], \
                          rhs [iy_start * nx:(iy_end - iy_start) * nx])
    while (error > tol && iter < iter_max) {
        error = 0.0;

        // Jacobi sweep. Fix: reduction(max:error) is required -- without it,
        // scalars are firstprivate in an OpenACC parallel construct and the
        // device-side updates to error would never reach the host.
#pragma acc parallel loop present(A, Anew, rhs) reduction(max : error)
        for (int iy = iy_start; iy < iy_end; iy++) {
            for (int ix = ix_start; ix < ix_end; ix++) {
                Anew[iy * nx + ix] =
                    -0.25 * (rhs[iy * nx + ix] - (A[iy * nx + ix + 1] + A[iy * nx + ix - 1] +
                                                  A[(iy - 1) * nx + ix] + A[(iy + 1) * nx + ix]));
                error = fmaxr(error, fabsr(Anew[iy * nx + ix] - A[iy * nx + ix]));
            }
        }

        // Combine the per-rank residuals into a global maximum.
        real globalerror = 0.0;
        MPI_Allreduce(&error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD);
        error = globalerror;

#pragma acc parallel loop present(A, Anew)
        for (int iy = iy_start; iy < iy_end; iy++) {
            for (int ix = ix_start; ix < ix_end; ix++) {
                A[iy * nx + ix] = Anew[iy * nx + ix];
            }
        }

        // Periodic boundary conditions
        int top = (rank == 0) ? (size - 1) : rank - 1;
        int bottom = (rank == (size - 1)) ? 0 : rank + 1;
#pragma acc host_data use_device(A)
        {
            double start_mpi = MPI_Wtime();
            // TODO: Replace both MPI calls with one-sided nvshmem_<type>_put
            // make sure to put data in the right location on the remote side
            // 1. Send row iy_start (first modified row) to top, receive lower boundary (iy_end)
            // from bottom
            MPI_Sendrecv(A + iy_start * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, top, 0,
                         A + iy_end * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE, bottom, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // 2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary
            // (iy_start-1) from top
            MPI_Sendrecv(A + (iy_end - 1) * nx + ix_start, (ix_end - ix_start), MPI_REAL_TYPE,
                         bottom, 0, A + (iy_start - 1) * nx + ix_start, (ix_end - ix_start),
                         MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // TODO: Add a barrier to make sure data had arrived from remote
            mpi_time += MPI_Wtime() - start_mpi;
        }

        // Periodic boundary in x: wrap the outermost columns onto the
        // opposite interior columns.
#pragma acc parallel loop present(A)
        for (int iy = iy_start; iy < iy_end; iy++) {
            A[iy * nx + 0] = A[iy * nx + (nx - 2)];
            A[iy * nx + (nx - 1)] = A[iy * nx + 1];
        }

        if (rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }

    // Copy this rank's chunk (plus halos) back to the host for checking.
#pragma acc update self(A [(iy_start - 1) * nx:((iy_end - iy_start) + 2) * nx])
    MPI_Barrier(MPI_COMM_WORLD);
    double runtime = MPI_Wtime() - start;

    int errors = 0;
    if (check_results(rank, ix_start, ix_end, iy_start, iy_end, tol, A, Aref, nx)) {
        if (rank == 0) {
            printf("Num GPUs: %d.\n", size);
            // Fix: a literal percent sign requires "%%" -- a lone '%' before
            // '\n' is an incomplete conversion spec and undefined behavior.
            printf("%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                   ny, nx, runtime_serial, size, runtime, runtime_serial / runtime,
                   runtime_serial / (size * runtime) * 100);
            // 4 transfers per iteration: two sends and two receives of one row each.
            printf(
                "MPI time: %8.4f s, inter GPU BW: %8.2f GiB/s\n", mpi_time,
                (iter * 4 * (ix_end - ix_start) * sizeof(real)) / (1024 * 1024 * 1024 * mpi_time));
        }
    } else {
        errors = -1;
    }

#pragma acc exit data delete (A, Aref, Anew, rhs)
    MPI_Finalize();
    free(rhs);
    free(Anew);
    free(Aref);
    free(A);
    // TODO: free shmem memory
    return errors;
}
File added
File added
File added
File added
File added
File added
This diff is collapsed.
4-GPU/HandsOn/.master/resources/Solution3.png

629 KiB

4-GPU/HandsOn/.master/resources/Solution4.png

641 KiB

4-GPU/HandsOn/.master/resources/Solution5.png

570 KiB

4-GPU/HandsOn/.master/resources/Solution6.png

567 KiB

4-GPU/HandsOn/.master/resources/eval.png

621 B

---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment