Skip to content
Snippets Groups Projects
Commit 910157a9 authored by Sandipan Mohanty's avatar Sandipan Mohanty
Browse files

Fix up day 4 examples

parent 497b2451
Branches
Tags
No related merge requests found
Compile with nvcc
nvcc -std=c++14 --expt-extended-lambda prog.cu -o prog.ex
nvcc -arch=sm_70 -O3 --extended-lambda prog.cu -o prog.ex
Compile with clang++
clang++ -std=c++14 prog.cu -o prog.ex -stdlib=libc++ --cuda-gpu-arch=XXXXX -I CUDAPATH/include -L CUDAPATH/lib64 -lcudart_static -ldl -lrt -lpthread
clang++ -std=c++17 -O3 -stdlib=libc++ --cuda-gpu-arch=sm_70 -I $(which nvcc)/../include -L $(which nvcc)/../lib64 -lcudart_static -ldl -lrt -lpthread -o prog.ex prog.cc
To use CUDA BLAS library, use "-lcublas".
To use CUDA Random number library, use "-lcurand"
Compile with nvc++ and stdpar
nvc++ -std=c++20 -O3 -stdpar -gpu=cc70 prog.cc -o prog.ex
#include <random>
#include <iostream>
#include <vector>
#include <algorithm>
#include <ranges>
#include <execution>
#include <limits>
#include <atomic>
#include "CountingIterator.hh"
#include <numeric>
#include <thrust/random.h>
namespace sr = std::ranges;
namespace sv = std::views;
// Container type used for a sampled group of birthdays. The commented-out
// alias below is an alternative that plugs TBB's scalable allocator into
// std::vector; the active alias uses the default allocator.
template <class T>
//using VectorType = std::vector<T, tbb::scalable_allocator<T>>;
using VectorType = std::vector<T>;
// Draw the birthdays of a group of `n` people.
//
// Parameters:
//   n      - number of people in the group (size of the returned vector).
//   offset - seed for the random engine; the same offset always yields the
//            same group, so each experiment passes its own counter value.
//
// Returns a vector of `n` day indices in the closed range [0, 364].
auto sample_group(size_t n, size_t offset) -> VectorType<int>
{
    using engine_type = thrust::default_random_engine;
    // thrust::uniform_int_distribution samples the CLOSED interval [a, b],
    // so a 365-day year is [0, 364]; an upper bound of 365 would give 366 days.
    constexpr int days_per_year = 365;

    VectorType<int> grp(n, 0);
    // The engine's seed type is narrower than size_t. Convert explicitly:
    // brace-initialising from a size_t is a narrowing conversion.
    engine_type eng{ static_cast<engine_type::result_type>(offset) };
    thrust::uniform_int_distribution<> dist{ 0, days_per_year - 1 };
    auto birthdays = [&]{ return dist(eng); };
    std::generate(grp.begin(), grp.end(), birthdays);
    return grp;
}
// Estimate, by repeated random sampling, the probability that at least two
// people in a group of `group_size` share a birthday.
//
// Parameters:
//   group_size - number of people per sampled group.
//   nexpt      - number of independent groups to sample.
//
// Returns the fraction of sampled groups containing a repeated birthday.
auto probability_for_equal_birthdays(size_t group_size,
                                     size_t nexpt = 10'000'000UL) -> double
{
    // One generator shared by all calls; each call takes a fresh starting
    // value from it, and experiment k is seeded with (start + k).
    static std::mt19937_64 offset{ std::random_device{}() };

    // Exercise: std::transform_reduce(first, last, init, reduce_op, transform_op)
    // also has an overload with an execution policy as an additional FIRST
    // argument. Pass std::execution::par there to request parallel execution
    // and see what happens!
    const auto first_seed = offset();
    const auto seeds_begin = algo_counter(first_seed);
    const auto seeds_end = algo_counter(first_seed + nexpt);

    // Maps one seed to 1 if the sampled group has a shared birthday, else 0.
    auto has_clash = [&](auto seed) {
        auto people = sample_group(group_size, seed);
        sr::sort(people);
        // After sorting, equal birthdays are adjacent; std::unique returns an
        // iterator before the end exactly when it had duplicates to drop.
        const auto unique_end = std::unique(people.begin(), people.end());
        return unique_end != people.end() ? 1UL : 0UL;
    };

    const auto clash_count = std::transform_reduce(seeds_begin, seeds_end, 0UL,
                                                   std::plus<size_t>{}, has_clash);
    return static_cast<double>(clash_count) / nexpt;
}
// Prints a table of shared-birthday probabilities for every group size from
// 0 up to, but not including, the limit. The limit is taken from the first
// command-line argument; when the program is run with no arguments it is 50.
auto main(int argc, char* argv[]) -> int
{
    unsigned long limit = 50UL;
    if (argc != 1) {
        limit = std::stoul(argv[1]);
    }

    std::cout << "Group size\tShared birthday probability\n\n";

    unsigned long members = 0UL;
    while (members < limit) {
        std::cout << members << "\t\t" << probability_for_equal_birthdays(members) << "\n";
        ++members;
    }
}
NVCC=nvc++
NVCC_FLAGS=-std=c++17 -O3 -stdpar
NVCC_FLAGS=-O3 -stdpar -gpu=cc70
.PHONY: all
all: jacobi_cxx17
all: jacobi_cxx17 jacobi_cxx20
jacobi_cxx17: jacobi_cxx17.cc Makefile
$(NVCC) $(CFLAGS) $(NVCC_FLAGS) $< $(LIBS) -o $@
jacobi_cxx17: jacobi_cxx17.cc
$(NVCC) $(CFLAGS) $(NVCC_FLAGS) -std=c++17 $< $(LIBS) -o $@
jacobi_cxx20: jacobi_cxx20.cc
$(NVCC) $(CFLAGS) $(NVCC_FLAGS) -std=c++20 $< $(LIBS) -o $@
.PHONY: clean
clean:
rm -f jacobi_cxx17
rm -f jacobi_cxx17 jacobi_cxx20
#include "CountingIterator.hh"
#include <algorithm>
#include <chrono>
#include <complex>
#include <execution>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <chrono>
void save_pgm(std::string filename, size_t width, size_t height,
std::vector<unsigned char> data)
......
#include "CountingIterator.hh"
#include "pngwriter.h"
#include <algorithm>
#include <chrono>
#include <complex>
#include <execution>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <execution>
#include <string>
#include <vector>
......
#include <chrono>
#include <iostream>
#include <string>
#include <chrono>
__global__ void mul(const double* A, const double* B, double* C, size_t N)
{
......@@ -8,7 +8,8 @@ __global__ void mul(const double *A, const double *B, double *C, size_t N)
auto j = threadIdx.y + blockIdx.y * blockDim.y;
double res {};
for (size_t k = 0ul; k < N; ++k) {
if (i<N && j<N) res += A[N*i + k] * B[N*k +j];
if (i < N && j < N)
res += A[N * i + k] * B[N * k + j];
}
C[N * i + j] = res;
}
......@@ -33,4 +34,3 @@ int main(int argc, char *argv[])
auto dt = std::chrono::duration<double>(t1 - t0).count();
std::cout << (2.0 * N - 1.) * N * N * 1.0e-9 / dt << " GFlops\n";
}
#include <chrono>
#include <iostream>
#include <string>
#include <chrono>
__global__ void mul(const double* A, const double* B, double* C, size_t N)
{
......@@ -34,4 +34,3 @@ int main(int argc, char *argv[])
auto dt = std::chrono::duration<double>(t1 - t0).count();
std::cout << (2.0 * N - 1.) * N * N * 1.0e-9 / dt << " GFlops\n";
}
#include <chrono>
#include <iostream>
#include <string>
#include <optional>
// Naive dense matrix product C = A * B for N x N matrices stored row-major
// in flat arrays.
//
// Launch layout: 2D grid of 2D blocks. Each thread computes one element
// C[i][j], with the row i taken from the x dimension and the column j taken
// from the y dimension of the launch. No shared memory is used.
//
// Threads whose (i, j) falls outside the matrix return without touching
// memory, so the grid is allowed to overhang the matrix edge.
__global__ void mul(const double* A, const double* B, double* C, size_t N)
{
    // Widen before multiplying so the global index cannot wrap in 32 bits.
    const size_t i = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
    const size_t j = threadIdx.y + static_cast<size_t>(blockIdx.y) * blockDim.y;

    // The guard must cover the store as well as the loads: an unguarded
    // write from an overhanging thread lands past the end of C (i >= N) or
    // on top of an element of the next row (j >= N).
    if (i >= N || j >= N) return;

    double res {};
    for (size_t k = 0ul; k < N; ++k) {
        res += A[N * i + k] * B[N * k + j];
    }
    C[N * i + j] = res;
}
// Allocate an array of N objects of type T with cudaMallocManaged and
// optionally fill it with a value.
//
// Parameters:
//   N    - number of elements to allocate.
//   init - if present, every element is assigned *init on the host.
//
// Returns a pointer to the allocation (release it with cudaFree), or nullptr
// when N is zero, when the byte count would overflow size_t, or when the
// CUDA allocation fails. A failure is reported on std::cerr.
template <class T>
auto malloc_usm(size_t N, std::optional<T> init = std::nullopt) -> T*
{
    if (N == 0) return nullptr;

    // Reject element counts whose byte size does not fit in size_t.
    if (N > static_cast<size_t>(-1) / sizeof(T)) {
        std::cerr << "malloc_usm: requested element count is too large\n";
        return nullptr;
    }

    T* ans{};
    const cudaError_t status = cudaMallocManaged(&ans, N * sizeof(T));
    if (status != cudaSuccess) {
        // Do not fall through to the fill loop with an invalid pointer.
        std::cerr << "malloc_usm: cudaMallocManaged failed: "
                  << cudaGetErrorString(status) << "\n";
        return nullptr;
    }

    if (init) { for (size_t i = 0UL; i < N; ++i) ans[i] = *init; }
    return ans;
}
// Benchmark driver: multiplies two N x N matrices of constants with the
// `mul` kernel and prints the achieved GFlops.
//
// The matrix size N is read from the first command-line argument (default
// 2048). The grid is sized by exact division of N by the 16 x 16 block, so N
// must be a positive multiple of 16; other sizes are rejected instead of
// silently leaving the trailing rows and columns uncomputed.
//
// Returns 0 on success and 1 on an invalid size or any CUDA failure.
int main(int argc, char* argv[])
{
    const unsigned long requested = (argc > 1) ? std::stoul(argv[1]) : 2048ul;
    const unsigned N = static_cast<unsigned>(requested);
    dim3 ThreadsPerBlock { 16, 16 };

    // N != requested catches values that do not fit in an unsigned int.
    if (N != requested || N == 0
        || N % ThreadsPerBlock.x != 0 || N % ThreadsPerBlock.y != 0) {
        std::cerr << "Matrix size must be a positive multiple of "
                  << ThreadsPerBlock.x << " that fits in an unsigned int\n";
        return 1;
    }

    // Compute the element count in size_t: N * N in unsigned arithmetic
    // wraps around once N reaches 65536.
    const size_t count = static_cast<size_t>(N) * N;

    auto a = malloc_usm<double>(count);
    auto b = malloc_usm<double>(count);
    auto c = malloc_usm<double>(count);
    if (a == nullptr || b == nullptr || c == nullptr) {
        std::cerr << "Failed to allocate three " << N << " x " << N << " matrices\n";
        cudaFree(c);
        cudaFree(b);
        cudaFree(a);
        return 1;
    }

    for (size_t i = 0ul; i < count; ++i) {
        a[i] = 1.1;
        b[i] = 0.89;
    }

    auto t0 = std::chrono::high_resolution_clock::now();
    dim3 NumBlocks { N / ThreadsPerBlock.x, N / ThreadsPerBlock.y };
    mul<<<NumBlocks, ThreadsPerBlock>>>(a, b, c, N);
    // A launch reports configuration errors only through cudaGetLastError;
    // faults during execution surface at the synchronising call.
    const cudaError_t launch_status = cudaGetLastError();
    const cudaError_t run_status = cudaDeviceSynchronize();
    auto t1 = std::chrono::high_resolution_clock::now();

    cudaFree(c);
    cudaFree(b);
    cudaFree(a);

    if (launch_status != cudaSuccess || run_status != cudaSuccess) {
        const cudaError_t failure = (launch_status != cudaSuccess) ? launch_status : run_status;
        std::cerr << "Kernel mul failed: " << cudaGetErrorString(failure) << "\n";
        return 1;
    }

    auto dt = std::chrono::duration<double>(t1 - t0).count();
    std::cout << (2.0 * N - 1.) * N * N * 1.0e-9 / dt << " GFlops\n";
    return 0;
}
......@@ -8,7 +8,7 @@
using namespace thrust::placeholders;
int main()
auto main() -> int
{
std::random_device seed;
std::mt19937_64 engine{seed()};
......
......@@ -15,7 +15,7 @@ auto main() -> int
std::generate(vals.begin(), vals.end(), [=]()mutable { return dist(engine); });
std::sort(std::execution::par, vals.begin(), vals.end());
std::sort(std::execution::par_unseq, vals.begin(), vals.end());
std::cout << "Middle element after sort = " << vals[N/2] << "\n";
}
......@@ -5,9 +5,10 @@
#include <algorithm>
#include <string>
#include <execution>
#include <span>
constexpr auto Lambda = 0.75;
constexpr auto L2 = Lambda * Lambda;
//constexpr auto L2 = Lambda * Lambda;
constexpr auto cut = 4.3;
constexpr auto cut2 = cut * cut;
constexpr auto icut2 = 1.0 / cut2;
......@@ -36,11 +37,20 @@ auto Vexv(T r2, T sigsa12) -> T
auto exvolcalc(const vector_type<double>& R2, const vector_type<double>& S12) -> double
{
return
std::transform_reduce(std::execution::par, R2.begin(), R2.end(), S12.begin(),
std::transform_reduce(std::execution::par_unseq, R2.begin(), R2.end(), S12.begin(),
0., std::plus<double>{}, [](auto r2, auto s12){
return Vexv(r2, s12);
});
}
// Sums Vexv(R2[k], S12[k]) over all k, using the par_unseq execution policy.
//
// Parameters:
//   R2  - first operand of each Vexv call; its length sets the number of terms.
//   S12 - second operand of each Vexv call; read at the same positions as R2,
//         so it must hold at least R2.size() elements.
//
// Returns the accumulated sum, starting from 0.
auto exvolcalc2(std::span<const double> R2, std::span<const double> S12) -> double
{
    // Per-pair term of the sum.
    auto pair_term = [](auto dist_sq, auto sigma) {
        return Vexv(dist_sq, sigma);
    };

    const auto r2_first = R2.begin();
    const auto r2_last = R2.end();
    const double total = std::transform_reduce(std::execution::par_unseq,
                                               r2_first, r2_last, S12.begin(),
                                               0., std::plus<double>{}, pair_term);
    return total;
}
struct options {
void usage(std::string prog)
......@@ -93,7 +103,7 @@ auto main(int argc, char* argv[]) -> int
double tottime = 0., totval =0.;
for (size_t j = 0; j < nrep; ++j) {
auto t0 = std::chrono::high_resolution_clock::now();
double tot = exvolcalc(R2, S12);
double tot = exvolcalc2(R2, S12);
auto t1 = std::chrono::high_resolution_clock::now();
auto timetaken = std::chrono::duration<double>(t1-t0).count();
tottime += timetaken;
......
#include <thrust/host_vector.h>
#include <cstdlib>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <cstdlib>
int main()
auto main() -> int
{
// generate 32 M random numbers on the host
thrust::host_vector<int> h_vec(1 << 22);
......@@ -28,4 +28,3 @@ int main()
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
std::cout << "Done.\n";
}
#include <thrust/host_vector.h>
#include <cstdlib>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <cstdlib>
int main()
auto main() -> int
{
// generate 32 M random numbers on the host
thrust::host_vector<int> h_vec(1 << 16);
......@@ -24,4 +24,3 @@ int main()
int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
std::cout << "Done. Result = " << x << "\n";
}
#include <cstdlib>
#include <thrust/copy.h>
#include <thrust/generate.h>
#include <thrust/universal_vector.h>
#include <thrust/sort.h>
auto main() -> int
{
thrust::universal_vector<int> h_vec(1 << 22);
std::cout << "Filling host vector with random numbers\n";
thrust::generate(thrust::host, h_vec.begin(), h_vec.end(), rand);
std::cout << "Done.\n";
std::cout << "Sorting vector on device\n";
thrust::sort(thrust::device, h_vec.begin(), h_vec.end());
std::cout << "Done.\n";
std::cout << h_vec[1121] << "\n";
}
#include <chrono>
#include <random>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/count.h>
#include <thrust/functional.h>
#include <random>
#include <chrono>
using namespace thrust::placeholders;
int main()
auto main() -> int
{
std::random_device seed;
std::mt19937_64 engine { seed() };
......@@ -18,12 +18,11 @@ int main()
thrust::generate(X.begin(), X.end(), gen);
thrust::device_vector<double> dX = X;
// auto norm = thrust::transform_reduce(dX.begin(), dX.end(), (_1 * _1), 0., (_1 + _2));
auto norm = thrust::transform_reduce(dX.begin(), dX.end(),
auto norm = thrust::transform_reduce(
dX.begin(), dX.end(),
[] __device__(auto a) { return a * a; },
0.,
[] __device__(auto a, auto b) { return a + b; });
std::cout << "Transform reduce: norm = " << norm << "\n";
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment