task4.py
import cupy
import mpi4py.MPI as MPI
from numba import cuda, float32
import timeit

# This script is meant to be launched with multiple MPI ranks,
# e.g.: mpirun -np 4 python task4.py
TPB = 1024  # threads per block

@cuda.jit
def sum_values(v, a):
    """Sum all values in v.

    Parameters
    ----------
    v: cupy.ndarray
        One-dimensional device array with the values to be summed.
    a: cupy.ndarray
        Device array with one element per thread block; receives the
        per-block partial sums.
    """
    # Shared memory array for the per-block partial sums
    shared = cuda.shared.array(shape=TPB, dtype=float32)

    tid = cuda.threadIdx.x
    i = cuda.grid(1)

    if i < v.size:
        shared[tid] = v[i]
    else:
        shared[tid] = 0.0
    
    cuda.syncthreads()

    # TODO: Perform a parallel reduction in shared memory (one possible sketch follows)
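    # One possible tree-based reduction (a sketch, assuming TPB is a power of
    # two): halve the number of active threads each step and let the lower
    # half accumulate the upper half.
    stride = TPB // 2
    while stride > 0:
        if tid < stride:
            shared[tid] += shared[tid + stride]
        cuda.syncthreads()
        stride //= 2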
    
    if tid == 0:
        a[cuda.blockIdx.x] = shared[0]
    
def main():
    comm = MPI.COMM_WORLD
    my_rank = comm.Get_rank()
    number_of_ranks = comm.Get_size()
    number_of_gpus_per_node = len(cuda.gpus)
    print(f"Rank {my_rank} of {number_of_ranks} sees {number_of_gpus_per_node} GPUs on its node")
    device_id = my_rank % number_of_gpus_per_node
    cuda.select_device(device_id)
    gpu_info = comm.gather((my_rank, device_id), root=0)
     
    N = 10000000
    # Each rank works on an equal-sized chunk (assumes N is divisible by the
    # number of ranks); partial_sum holds one value per thread block.
    a_partial = cupy.empty(N // number_of_ranks, dtype=cupy.float32)
    partial_sum = cupy.zeros((a_partial.size + TPB - 1) // TPB, dtype=cupy.float32)

    # Set up the launch configuration for the GPU
    block = TPB
    grid = (a_partial.size + block - 1) // block

    # Print the GPU information gathered on rank 0 to verify the device mapping
    if my_rank == 0:
        for rank, gpu_id in gpu_info:
            print(f"Rank {rank} is using GPU {gpu_id}")

    # Create an array with N elements on the root process
    if my_rank == 0:
        a = cupy.arange(N, dtype=cupy.float32)
    else:
        a = None
    
    # TODO: Use MPI to scatter the array to all processes and sum the values into total_sum
    # Hint: Use Task2 and Task3 for reference; one possible sketch follows.
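    # A minimal sketch of one possible approach (not the only one). It assumes
    # a CUDA-aware MPI build, so mpi4py's buffer API can accept cupy arrays
    # directly, and that N is divisible by the number of ranks.
    comm.Scatter(a, a_partial, root=0)               # distribute one chunk per rank
    sum_values[grid, block](a_partial, partial_sum)  # per-block sums on the local GPU
    local_sum = float(cupy.sum(partial_sum))         # combine the block results locally
    total_sum = comm.reduce(local_sum, op=MPI.SUM, root=0)  # global sum on rank 0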
        
    # Gather per-rank details on rank 0 for a quick sanity check
    proc_info = comm.gather((my_rank, partial_sum, a_partial), root=0)
    if my_rank == 0:
        for rank_proc, psum, part in proc_info:
            print(f"Rank {rank_proc}/{number_of_ranks} handled {part.size}/{a.size} "
                  f"elements, partial sum {float(cupy.sum(psum))}")

    if my_rank == 0:
        return total_sum
        
if __name__ == "__main__":
    start_time = timeit.default_timer()
    result = main()
    if result is not None:
        print(f"Sum of array: {result}")
        print(f"Direct sum of array: {cupy.sum(cupy.arange(10000000))}")
    execution_time = timeit.default_timer() - start_time
    print(f"Execution time: {execution_time}")