Fix up day 4 examples

910157a9 · Sandipan Mohanty · 497b2451 · 910157a9 · 910157a9 · 910157a9
Commit 910157a9 authored Jun 10, 2022 by Sandipan Mohanty
--- a/day4/examples/README
+++ b/day4/examples/README
 Compile with nvcc

-nvcc -std=c++14 --expt-extended-lambda prog.cu -o prog.ex
+nvcc -arch=sm_70 -O3 --extended-lambda prog.cu -o prog.ex


 Compile with clang++
-
-clang++ -std=c++14 prog.cu -o prog.ex -stdlib=libc++ --cuda-gpu-arch=XXXXX  -I CUDAPATH/include -L CUDAPATH/lib64 -lcudart_static -ldl -lrt -lpthread
+clang++ -std=c++17 -O3 -stdlib=libc++ --cuda-gpu-arch=sm_70  -I $(dirname $(which nvcc))/../include -L $(dirname $(which nvcc))/../lib64 -lcudart_static -ldl -lrt -lpthread -o prog.ex prog.cc

 To use CUDA BLAS library, use "-lcublas".
 To use CUDA Random number library, use "-lcurand"
 
+Compile with nvc++ and stdpar
+
+nvc++ -std=c++20 -O3 -stdpar -gpu=cc70 prog.cc -o prog.ex
+
+
--- a/day4/examples/birthday_problem.cc
+++ b/day4/examples/birthday_problem.cc
+#include <random>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <ranges>
+#include <execution>
+#include <limits>
+#include <atomic>
+#include "CountingIterator.hh"
+#include <numeric>
+#include <thrust/random.h>
+
+namespace sr = std::ranges;
+namespace sv = std::views;
+
+template <class T>
+//using VectorType = std::vector<T, tbb::scalable_allocator<T>>;
+using VectorType = std::vector<T>;
+
+// Draw a random birthday for every member of one group.
+//
+//   n       number of people in the group
+//   offset  seed for the random engine; the caller passes a different
+//           value for each experiment
+//
+// Returns n day-of-year indices, each in the closed range [0, 364].
+auto sample_group(size_t n, size_t offset) -> VectorType<int>
+{
+    using engine_type = thrust::default_random_engine;
+    // The engine's seed type may be narrower than size_t, so convert
+    // explicitly instead of relying on an implicit narrowing inside braces.
+    engine_type eng{ static_cast<engine_type::result_type>(offset) };
+    // Both bounds are inclusive: 0..364 gives exactly 365 equally likely
+    // days, whereas 0..365 would silently simulate a 366 day year.
+    thrust::uniform_int_distribution<> dist{ 0, 364 };
+    VectorType<int> grp(n, 0);
+    std::generate(grp.begin(), grp.end(), [&] { return dist(eng); });
+    return grp;
+}
+
+// Monte-Carlo estimate of the probability that at least two people in a
+// group of `group_size` share a birthday.
+//
+//   group_size  number of people in each sampled group
+//   nexpt       number of groups to sample; must be non-zero because the
+//               result is divided by it
+//
+// Returns (number of groups with a shared birthday) / nexpt.
+auto probability_for_equal_birthdays(size_t group_size,
+        size_t nexpt = 10'000'000UL) -> double
+{
+    // One generator for the whole program run: every call takes a fresh
+    // starting value, and experiment number k is seeded with start + k.
+    static std::mt19937_64 offset{ std::random_device{}() };
+    // transform_reduce(start, end, init, accumulator_op, transform_op);
+    // That's the normal syntax of transform_reduce. But, there is another
+    // overload that takes one extra argument at the front: an execution policy.
+    // Add one extra argument to transform_reduce: std::execution::par which
+    // specifies a parallel execution policy, and see what happens!
+    auto count_begin = offset();
+    // group_size is captured by value so that the callable holds no
+    // reference into this stack frame once an execution policy is added.
+    auto nclashes = std::transform_reduce(algo_counter(count_begin),
+            algo_counter(count_begin + nexpt), 0UL,
+            std::plus<size_t>{},
+            [=](auto counter) {
+        auto group = sample_group(group_size, counter);
+        sr::sort(group);
+        // After sorting, equal birthdays are neighbours, so one adjacent
+        // equal pair is enough to know that the group has a clash.
+        const bool has_clash = sr::adjacent_find(group) != group.end();
+        return has_clash ? 1UL : 0UL;
+    });
+    return static_cast<double>(nclashes) / nexpt;
+}
+
+// Print a table of shared-birthday probabilities for the group sizes
+// 0, 1, ..., limit - 1. The limit is 50 when the program is started
+// without arguments, otherwise it is parsed from the first argument.
+auto main(int argc, char* argv[]) -> int
+{
+    auto limit = 50UL;
+    if (argc != 1) limit = std::stoul(argv[1]);
+
+    std::cout << "Group size\tShared birthday probability\n\n";
+
+    auto size = 0UL;
+    while (size < limit) {
+        // The row label is written before the (long) estimate is computed.
+        std::cout << size << "\t\t";
+        std::cout << probability_for_equal_birthdays(size) << "\n";
+        ++size;
+    }
+}
+
--- a/day4/examples/jacobi/Makefile
+++ b/day4/examples/jacobi/Makefile
 NVCC=nvc++
-NVCC_FLAGS=-std=c++17 -O3 -stdpar
+NVCC_FLAGS=-O3 -stdpar -gpu=cc70

 .PHONY: all

-all: jacobi_cxx17
+all: jacobi_cxx17 jacobi_cxx20

-jacobi_cxx17: jacobi_cxx17.cc Makefile
-	$(NVCC) $(CFLAGS) $(NVCC_FLAGS) $< $(LIBS) -o $@
+jacobi_cxx17: jacobi_cxx17.cc
+	$(NVCC) $(CFLAGS) $(NVCC_FLAGS) -std=c++17 $< $(LIBS) -o $@
+jacobi_cxx20: jacobi_cxx20.cc
+	$(NVCC) $(CFLAGS) $(NVCC_FLAGS) -std=c++20 $< $(LIBS) -o $@


 .PHONY: clean

 clean:
-	rm -f jacobi_cxx17
+	rm -f jacobi_cxx17 jacobi_cxx20


--- a/day4/examples/mandelbrot1.cc
+++ b/day4/examples/mandelbrot1.cc
 #include "CountingIterator.hh"
 #include <algorithm>
+#include <chrono>
 #include <complex>
 #include <execution>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
-#include <chrono>

 void save_pgm(std::string filename, size_t width, size_t height,
    std::vector<unsigned char> data)

--- a/day4/examples/mandelbrot_cpu.cc
+++ b/day4/examples/mandelbrot_cpu.cc
 #include "CountingIterator.hh"
 #include "pngwriter.h"
+#include <algorithm>
 #include <chrono>
 #include <complex>
+#include <execution>
 #include <fstream>
 #include <iostream>
-#include <algorithm>
-#include <execution>
 #include <string>
 #include <vector>


--- a/day4/examples/matmul_trivial.cu
+++ b/day4/examples/matmul_trivial.cu
+#include <chrono>
 #include <iostream>
 #include <string>
-#include <chrono>

 __global__ void mul(const double* A, const double* B, double* C, size_t N)
 {
@@ -8,7 +8,8 @@ __global__ void mul(const double *A, const double *B, double *C, size_t N)
    auto j = threadIdx.y + blockIdx.y * blockDim.y;
    double res {};
    for (size_t k = 0ul; k < N; ++k) {
-        if (i<N && j<N) res += A[N*i + k] * B[N*k +j];
+        if (i < N && j < N)
+            res += A[N * i + k] * B[N * k + j];
    }
    C[N * i + j] = res;
 }
@@ -33,4 +34,3 @@ int main(int argc, char *argv[])
    auto dt = std::chrono::duration<double>(t1 - t0).count();
    std::cout << (2.0 * N - 1.) * N * N * 1.0e-9 / dt << " GFlops\n";
 }
-
--- a/day4/examples/matmul_trivial2.cu
+++ b/day4/examples/matmul_trivial2.cu
+#include <chrono>
 #include <iostream>
 #include <string>
-#include <chrono>

 __global__ void mul(const double* A, const double* B, double* C, size_t N)
 {
@@ -34,4 +34,3 @@ int main(int argc, char *argv[])
    auto dt = std::chrono::duration<double>(t1 - t0).count();
    std::cout << (2.0 * N - 1.) * N * N * 1.0e-9 / dt << " GFlops\n";
 }
-
--- a/day4/examples/matmul_trivial3.cu
+++ b/day4/examples/matmul_trivial3.cu
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <optional>
+
+// Naive matrix product C = A * B for N x N matrices stored in flat arrays,
+// element (i, j) at index N * i + j. Each thread computes one element of C
+// from its 2D thread/block index.
+__global__ void mul(const double* A, const double* B, double* C, size_t N)
+{
+    auto i = threadIdx.x + blockIdx.x * blockDim.x;
+    auto j = threadIdx.y + blockIdx.y * blockDim.y;
+    // The grid may be larger than the matrix. Threads that fall outside
+    // must not read A/B and, just as importantly, must not write to C.
+    if (i >= N || j >= N) return;
+    double res {};
+    for (size_t k = 0ul; k < N; ++k) {
+        res += A[N * i + k] * B[N * k + j];
+    }
+    C[N * i + j] = res;
+}
+
+// Allocate storage for N objects of type T with cudaMallocManaged.
+//
+//   N     number of elements
+//   init  optional value to copy into every element; when absent the
+//         storage is returned without being written to
+//
+// Returns the allocated pointer, or nullptr if the allocation failed.
+// Release the memory with cudaFree.
+template <class T>
+auto malloc_usm(size_t N, std::optional<T> init = std::nullopt) -> T*
+{
+    T* ans{};
+    // Do not run the initialisation loop on a pointer that was never set.
+    if (cudaMallocManaged(&ans, N * sizeof(T)) != cudaSuccess) return nullptr;
+    if (init) { for (size_t i = 0UL; i < N; ++i) ans[i] = *init; }
+    return ans;
+}
+
+// Multiply two N x N matrices with the mul kernel and report the achieved
+// GFlops. N is taken from the first command line argument (default 2048).
+// Returns 1 if memory could not be allocated or the kernel failed.
+int main(int argc, char* argv[])
+{
+    const unsigned N = (argc > 1) ? std::stoul(argv[1]) : 2048u;
+    // Form the element count in size_t: N * N in 32-bit unsigned arithmetic
+    // wraps around for N >= 65536.
+    const size_t nelem = static_cast<size_t>(N) * N;
+    auto a = malloc_usm<double>(nelem, 1.1);
+    auto b = malloc_usm<double>(nelem, 0.89);
+    auto c = malloc_usm<double>(nelem);
+    // cudaFree accepts null pointers, so this is safe on every exit path.
+    auto release = [&] {
+        cudaFree(c);
+        cudaFree(b);
+        cudaFree(a);
+    };
+    if (!a || !b || !c) {
+        std::cerr << "Could not allocate managed memory for N = " << N << "\n";
+        release();
+        return 1;
+    }
+    auto t0 = std::chrono::high_resolution_clock::now();
+    dim3 ThreadsPerBlock { 16, 16 };
+    // Round the block count up so that sizes which are not a multiple of
+    // the block edge are still covered; the kernel discards surplus threads.
+    dim3 NumBlocks { (N + ThreadsPerBlock.x - 1) / ThreadsPerBlock.x,
+                     (N + ThreadsPerBlock.y - 1) / ThreadsPerBlock.y };
+    mul<<<NumBlocks, ThreadsPerBlock>>>(a, b, c, N);
+    // Check the launch itself first, then wait for the kernel to finish.
+    auto status = cudaGetLastError();
+    if (status == cudaSuccess) status = cudaDeviceSynchronize();
+    auto t1 = std::chrono::high_resolution_clock::now();
+    if (status != cudaSuccess) {
+        std::cerr << "Kernel execution failed: " << cudaGetErrorString(status) << "\n";
+        release();
+        return 1;
+    }
+    auto dt = std::chrono::duration<double>(t1 - t0).count();
+    std::cout << (2.0 * N - 1.) * N * N * 1.0e-9 / dt << " GFlops\n";
+    release();
+}
--- a/day4/examples/saxpy0.cu
+++ b/day4/examples/saxpy0.cu
@@ -8,7 +8,7 @@

 using namespace thrust::placeholders;

-int main()
+auto main() -> int
 {
    std::random_device seed;
    std::mt19937_64 engine{seed()};

--- a/day4/examples/stdpardemo0.cc
+++ b/day4/examples/stdpardemo0.cc
@@ -15,7 +15,7 @@ auto main() -> int

    std::generate(vals.begin(), vals.end(), [=]()mutable { return dist(engine); });

-    std::sort(std::execution::par, vals.begin(), vals.end());
+    std::sort(std::execution::par_unseq, vals.begin(), vals.end());
    std::cout << "Middle element after sort = " << vals[N/2] << "\n";
 }

--- a/day4/examples/stdpardemo1.cc
+++ b/day4/examples/stdpardemo1.cc
@@ -5,9 +5,10 @@
 #include <algorithm>
 #include <string>
 #include <execution>
+#include <span>

 constexpr auto Lambda = 0.75;
-constexpr auto L2 = Lambda * Lambda;
+//constexpr auto L2 = Lambda * Lambda;
 constexpr auto cut = 4.3;
 constexpr auto cut2 = cut * cut;
 constexpr auto icut2 = 1.0 / cut2;
@@ -36,11 +37,20 @@ auto Vexv(T r2, T sigsa12) -> T
 auto exvolcalc(const vector_type<double>& R2, const vector_type<double>& S12) -> double
 {
    return 
-    std::transform_reduce(std::execution::par, R2.begin(), R2.end(), S12.begin(), 
+    std::transform_reduce(std::execution::par_unseq, R2.begin(), R2.end(), S12.begin(), 
        0., std::plus<double>{}, [](auto r2, auto s12){
        return Vexv(r2, s12);
    });
 }
+// Sum Vexv(R2[i], S12[i]) over all elements of R2, using the par_unseq
+// execution policy. The spans are read-only views; S12 is read at every
+// index of R2 and is therefore expected to be at least as long as R2.
+auto exvolcalc2(std::span<const double> R2, std::span<const double> S12) -> double
+{
+    const auto pair_term = [](auto r2, auto s12) { return Vexv(r2, s12); };
+    const auto first = R2.begin();
+    const auto last = R2.end();
+    const auto partner = S12.begin();
+    return std::transform_reduce(std::execution::par_unseq, first, last, partner,
+        0., std::plus<double>{}, pair_term);
+}
+

 struct options {
    void usage(std::string prog)
@@ -93,7 +103,7 @@ auto main(int argc, char* argv[]) -> int
        double tottime = 0., totval =0.;
        for (size_t j = 0; j < nrep; ++j) {
            auto t0 = std::chrono::high_resolution_clock::now();
-            double tot = exvolcalc(R2, S12);
+            double tot = exvolcalc2(R2, S12);
            auto t1 = std::chrono::high_resolution_clock::now();
            auto timetaken = std::chrono::duration<double>(t1-t0).count();
            tottime += timetaken;

--- a/day4/examples/thrust0.cu
+++ b/day4/examples/thrust0.cu
-#include <thrust/host_vector.h>
+#include <cstdlib>
+#include <thrust/copy.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
+#include <thrust/host_vector.h>
 #include <thrust/sort.h>
-#include <thrust/copy.h>
-#include <cstdlib>

-int main()
+auto main() -> int
 {
    // generate 32 M random numbers on the host
    thrust::host_vector<int> h_vec(1 << 22);
@@ -28,4 +28,3 @@ int main()
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
    std::cout << "Done.\n";
 }
-
--- a/day4/examples/thrust1.cu
+++ b/day4/examples/thrust1.cu
-#include <thrust/host_vector.h>
+#include <cstdlib>
+#include <thrust/copy.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
+#include <thrust/host_vector.h>
 #include <thrust/sort.h>
-#include <thrust/copy.h>
-#include <cstdlib>

-int main()
+auto main() -> int
 {
    // generate 32 M random numbers on the host
    thrust::host_vector<int> h_vec(1 << 16);
@@ -24,4 +24,3 @@ int main()
    int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
    std::cout << "Done. Result = " << x << "\n";
 }
-
--- a/day4/examples/thrust_usm.cu
+++ b/day4/examples/thrust_usm.cu
+#include <cstdlib>
+#include <iostream>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+#include <thrust/generate.h>
+#include <thrust/sort.h>
+#include <thrust/universal_vector.h>
+
+// Fill a single thrust::universal_vector with random numbers using the
+// host execution policy, sort the same vector using the device policy,
+// and print one element of the result. No explicit copy between two
+// separate containers appears in this program.
+auto main() -> int
+{
+    constexpr auto count = 1 << 22;
+    thrust::universal_vector<int> values(count);
+
+    std::cout << "Filling host vector with random numbers\n";
+    thrust::generate(thrust::host, values.begin(), values.end(), rand);
+    std::cout << "Done.\n"
+              << "Sorting vector on device\n";
+
+    thrust::sort(thrust::device, values.begin(), values.end());
+    std::cout << "Done.\n";
+
+    // Read one arbitrary element back to show the sorted data is usable.
+    std::cout << values[1121] << "\n";
+}
--- a/day4/examples/transform_reduce.cu
+++ b/day4/examples/transform_reduce.cu
+#include <chrono>
+#include <random>
+#include <thrust/count.h>
 #include <thrust/device_vector.h>
+#include <thrust/functional.h>
 #include <thrust/host_vector.h>
 #include <thrust/transform.h>
-#include <thrust/count.h>
-#include <thrust/functional.h>
-#include <random>
-#include <chrono>

 using namespace thrust::placeholders;

-int main()
+auto main() -> int
 {
    std::random_device seed;
    std::mt19937_64 engine { seed() };
@@ -18,12 +18,11 @@ int main()
    thrust::generate(X.begin(), X.end(), gen);
    thrust::device_vector<double> dX = X;
    // auto norm = thrust::transform_reduce(dX.begin(), dX.end(), (_1 * _1), 0., (_1 + _2));
-    auto norm = thrust::transform_reduce(dX.begin(), dX.end(), 
+    auto norm = thrust::transform_reduce(
+        dX.begin(), dX.end(),
        [] __device__(auto a) { return a * a; },
        0.,
        [] __device__(auto a, auto b) { return a + b; });

    std::cout << "Transform reduce: norm = " << norm << "\n";
 }
-
-