Commit 26cd17cc authored by Dirk Pleiter

Initial 2021 version

parent b50e2800
# PDC Summer School 2021: CUDA Lab Exercises
## Sources
The source code for the exercises can be found under `lab_1/C`, `lab_1/Fortran`,
`lab_2/C` and `lab_2/Fortran`.
## Instructions for the exercises
The instructions for the different exercises are here:

- Lab 1
  - C programmers: [Lab 1](lab_1/README.md)
  - Fortran programmers: [Lab 1](lab_1/README.md) and [specific Fortran guidelines](lab_1/Fortran/README.md)
- Lab 2: [Lab 2](lab_2/README.md)
@@ -30,7 +30,7 @@ int main(int argc, char **argv)
gpu_helloworld<<<grid, block>>>();
////////////////
// TO-DO A1.2 ////////////////////
// Introduce your changes here! //
//////////////////////////////////
@@ -42,7 +42,7 @@ void cpu_saxpy(int n, float a, float *x, float *y)
}
////////////////
// TO-DO A2.6 /////////////////////////////////////////////////////////////
// Declare the kernel gpu_saxpy() with the same interface as cpu_saxpy() //
///////////////////////////////////////////////////////////////////////////
@@ -54,12 +54,12 @@ int main(int argc, char **argv)
float error = 0.0f;
////////////////
// TO-DO A2.2 ///////////////////////////////
// Introduce the grid and block definition //
/////////////////////////////////////////////
//////////////////
// TO-DO A2.3.1 /////////////////////////////
// Declare the device pointers d_x and d_y //
/////////////////////////////////////////////
@@ -83,7 +83,7 @@ int main(int argc, char **argv)
}
//////////////////
// TO-DO A2.3.2 ////////////////////////////////////////////////////////
// Allocate d_x and d_y on the GPU, and copy the content from the CPU //
////////////////////////////////////////////////////////////////////////
@@ -94,12 +94,12 @@ int main(int argc, char **argv)
error = generate_hash(ARRAY_SIZE, y);
////////////////
// TO-DO A2.4 ////////////////////////////////////////
// Call the GPU kernel gpu_saxpy() with d_x and d_y //
//////////////////////////////////////////////////////
//////////////////
// TO-DO A2.5.1 ////////////////////////////////////////////////////
// Copy the content of d_y from the GPU to the array y on the CPU //
////////////////////////////////////////////////////////////////////
@@ -119,7 +119,7 @@ int main(int argc, char **argv)
free(y);
//////////////////
// TO-DO A2.5.2 /////////
// Release d_x and d_y //
/////////////////////////
# PDC Summer School 2021: CUDA Laboratory 1 / Guidelines for CUDA Fortran
In this document we cover the very basic CUDA Fortran concepts and compare them with their CUDA C counterparts. We ask you to follow the CUDA Laboratory 1 description in C and to use this document to understand what the corresponding changes look like in Fortran.
## Compiling a CUDA Fortran Program
The compilation is very similar to CUDA C, but with slight variations. First, you need to load not only the CUDA module, but also the PGI compiler:
```
module load cuda pgi
```
To compile a CUDA Fortran program, use ``pgfortran`` and specify the target GPU architecture (e.g., ``cc3x``):
```
pgfortran -Mcuda=cc3x your_cuda_file.cuf -o your_cuda_file.out
```
You can run a program as in the CUDA C version, allocating a node first with ``salloc`` and then running the code with ``srun``:
```
srun -n 1 ./your_cuda_file.out
@@ -29,9 +24,7 @@ srun -n 1 ./your_cuda_file.out
## Kernel Management
The concept of ``grid`` and ``block`` is the same as in CUDA C, but both variables need to be declared as ``type(dim3)``. The following example uses a grid of one block with 32 threads in X; a fuller sketch of the declaration and launch is given after this example:
```
type(dim3) :: grid
@@ -48,31 +41,22 @@ call your_kernel<<<grid, block>>>( ... )
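Since the diff above only shows part of this example, here is a minimal sketch of how the declaration, initialisation and launch fit together. It assumes ``use cudafor`` is in scope and reuses the ``your_kernel(n, d_x)`` interface from the kernel example further down; the values correspond to the one-block, 32-thread configuration mentioned above:

```
type(dim3) :: grid
type(dim3) :: block

grid  = dim3(1, 1, 1)   ! one block in the grid
block = dim3(32, 1, 1)  ! 32 threads per block in X

call your_kernel<<<grid, block>>>(n, d_x)
```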
## Memory Management
To allocate memory on the GPU and release it afterwards, use the ``cudaMalloc()`` and ``cudaFree()`` functions. The variable needs to be declared with the ``device`` attribute:
```
real, allocatable, device :: d_x(:)
hr = cudaMalloc(d_x, 256)
```
Here, we declare an array ``d_x`` of type ``real, allocatable`` to be used on the GPU. We then use ``cudaMalloc()`` to allocate space for 256 elements. The status of the operation is captured in the integer ``hr``, in case we want to check for errors.
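The status code can also be checked explicitly, and the device memory should be released with ``cudaFree()`` once it is no longer needed. The following is only a small sketch (it assumes ``use cudafor`` is in scope; the error message is purely illustrative):

```
real, allocatable, device :: d_x(:)
integer :: hr

hr = cudaMalloc(d_x, 256)          ! allocate 256 elements on the GPU
if (hr /= cudaSuccess) then
    write(*,*) 'cudaMalloc failed with status ', hr
end if

! ... launch kernels that use d_x ...

hr = cudaFree(d_x)                 ! release the device memory
```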
To copy memory from the host to the GPU (or vice versa), use the ``cudaMemcpy()`` function:
```
hr = cudaMemcpy(d_x, x, ARRAY_SIZE) ! Copy the content from x to d_x
```
Compared to the CUDA C version, the main difference is that we no longer have to specify the direction of the copy. In this case, we are copying from ``x`` (on the CPU) to ``d_x`` (on the GPU), but we could reverse the direction by simply swapping the two variables:
```
hr = cudaMemcpy(x, d_x, ARRAY_SIZE) ! Copy the content from d_x to x
@@ -80,9 +64,7 @@ hr = cudaMemcpy(x, d_x, ARRAY_SIZE) ! Copy the content from d_x to x
## Kernel Implementation
CUDA Fortran kernels are once again very similar to their CUDA C counterparts. In this case, you need to declare a new subroutine with the ``global`` attribute, such as:
```
attributes(global) subroutine your_kernel(n, d_x)
@@ -90,26 +72,21 @@ attributes(global) subroutine your_kernel(n, d_x)
end subroutine your_kernel
```
The type definition of the constant arguments, such as "n" in the previous example, must contain the attribute ``value``:
```
integer, value :: n
```
We also recommend that you specify the intent of each argument. In the case of ``d_x``, we could declare it as follows (note that, inside the kernel, we do not specify the ``device`` attribute):
```
real, intent(inout) :: d_x(:)
```
Finally, the predefined variables ``gridDim``, ``blockDim``, ``blockIdx`` and ``threadIdx`` are all available inside a CUDA Fortran kernel:
```
integer :: tid
tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x
```
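Putting the pieces above together, a complete kernel with the ``your_kernel(n, d_x)`` interface could look like the sketch below. It has to be placed inside a module (for example the ``CUDAKernels`` module used in the exercises); the bounds check and the scaling operation are only illustrative choices, not part of the lab code:

```
attributes(global) subroutine your_kernel(n, d_x)
    implicit none
    integer, value      :: n
    real, intent(inout) :: d_x(:)
    integer             :: tid

    ! Global thread index (Fortran arrays are 1-based)
    tid = (blockIdx%x - 1) * blockDim%x + threadIdx%x

    ! Guard against threads that fall outside the array bounds
    if (tid <= n) then
        d_x(tid) = 2.0 * d_x(tid)   ! illustrative operation only
    end if
end subroutine your_kernel
```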
@@ -32,7 +32,7 @@ end module HelperSubroutines
module CUDAKernels
contains
!////////////////
!// TO-DO A2.6 /////////////////////////////////////////////////////////////
!// Declare the kernel gpu_saxpy() with the same interface as cpu_saxpy() //
!///////////////////////////////////////////////////////////////////////////
end module CUDAKernels
@@ -54,12 +54,12 @@ program lab01_ex2
integer :: hr
!////////////////
!// TO-DO A2.2 ///////////////////////////////
!// Introduce the grid and block definition //
!/////////////////////////////////////////////
!//////////////////
!// TO-DO A2.3.1 /////////////////////////////
!// Declare the device pointers d_x and d_y //
!/////////////////////////////////////////////
@@ -81,7 +81,7 @@ program lab01_ex2
y = 0.2
!//////////////////
!// TO-DO A2.3.2 ////////////////////////////////////////////////////////
!// Allocate d_x and d_y on the GPU, and copy the content from the CPU //
!////////////////////////////////////////////////////////////////////////
@@ -92,12 +92,12 @@ program lab01_ex2
call generate_hash(ARRAY_SIZE, y, error);
!////////////////
!// TO-DO A2.4 ////////////////////////////////////////
!// Call the GPU kernel gpu_saxpy() with d_x and d_y //
!//////////////////////////////////////////////////////
!//////////////////
!// TO-DO A2.5.1 ////////////////////////////////////////////////////
!// Copy the content of d_y from the GPU to the array y on the CPU //
!////////////////////////////////////////////////////////////////////
@@ -118,7 +118,7 @@ program lab01_ex2
deallocate(y)
!//////////////////
!// TO-DO A2.5.2 /////////
!// Release d_x and d_y //
!/////////////////////////
end program lab01_ex2
@@ -177,7 +177,7 @@ void cpu_grayscale(int width, int height, float *image, float *image_out)
__global__ void gpu_grayscale(int width, int height, float *image, float *image_out)
{
////////////////
// TO-DO B2.2 /////////////////////////////////////////////
// Implement the GPU version of the grayscale conversion //
///////////////////////////////////////////////////////////
}
@@ -209,7 +209,7 @@ float cpu_applyFilter(float *image, int stride, float *matrix, int filter_dim)
__device__ float gpu_applyFilter(float *image, int stride, float *matrix, int filter_dim)
{
////////////////
// TO-DO B3.2 ////////////////////////////////////////////////
// Implement the GPU version of cpu_applyFilter() //
// //
// Does it make sense to have a separate gpu_applyFilter()? //
@@ -297,7 +297,7 @@ void cpu_sobel(int width, int height, float *image, float *image_out)
__global__ void gpu_sobel(int width, int height, float *image, float *image_out)
{
////////////////
// TO-DO B4.1 /////////////////////////////////////
// Implement the GPU version of the Sobel filter //
///////////////////////////////////////////////////
}
@@ -7,7 +7,7 @@ contains
!attributes(global) subroutine gpu_grayscale(width, height, image, image_out)
! implicit none
!////////////////
!// TO-DO B2.2 /////////////////////////////////////////////
!// Implement the GPU version of the grayscale conversion //
!///////////////////////////////////////////////////////////
!end subroutine gpu_grayscale
@@ -52,7 +52,7 @@ contains
integer, value :: filter_dim
real, intent(out) :: pixel_out
!////////////////
!// TO-DO B3.2 ////////////////////////////////////////////////
!// Implement the GPU version of cpu_applyFilter() //
!// //
!// Does it make sense to have a separate gpu_applyFilter()? //
@@ -95,7 +95,7 @@ contains
!attributes(global) subroutine gpu_sobel(width, height, image, image_out)
! implicit none
!////////////////
!// TO-DO B4.1 /////////////////////////////////////
!// Implement the GPU version of the Sobel filter //
!///////////////////////////////////////////////////
!end subroutine gpu_sobel