# File: task4.py
# Author: Sindhu Vasireddy
import cupy
import mpi4py.MPI as MPI
from numba import cuda
import timeit
# Threads per block. The tree reduction in sum_values halves the number of
# active threads on every step, so the launch block size must be a power of
# two and must not exceed TPB (the size of the shared-memory buffer).
TPB = 1024


@cuda.jit
def sum_values(v, a):
    """Compute one partial sum of v per thread block.

    Every thread copies one element of v into a shared-memory buffer
    (threads past the end of v contribute 0.0). The block then reduces the
    buffer pairwise and thread 0 writes the block's result to
    ``a[blockIdx.x]``. The caller is responsible for adding up the entries
    of ``a`` to obtain the sum of all values in v.

    Parameters
    ----------
    v: device array
        One-dimensional array with values to be summed.
    a: device array
        One-dimensional array with at least one element per launched
        block; element ``b`` receives the partial sum of block ``b``.
    """
    # Shared memory array for partial sums
    shared = cuda.shared.array(shape=TPB, dtype=cupy.float32)

    tid = cuda.threadIdx.x
    i = cuda.grid(1)

    # Out-of-range threads store the additive identity so that they can take
    # part in the reduction without affecting the result.
    if i < v.size:
        shared[tid] = v[i]
    else:
        shared[tid] = 0.0
    cuda.syncthreads()

    # Parallel reduction in shared memory: on each step the lower half of the
    # active threads adds in the value held by its partner in the upper half.
    stride = cuda.blockDim.x // 2
    while stride > 0:
        if tid < stride:
            shared[tid] += shared[tid + stride]
        # All threads must reach the barrier (it is outside the `if`) before
        # the next step reads values written in this one.
        cuda.syncthreads()
        stride //= 2

    if tid == 0:
        a[cuda.blockIdx.x] = shared[0]
def main():
    """Sum an array across MPI ranks, one GPU reduction per rank.

    Rank 0 creates ``arange(N)`` on its GPU, the array is scattered to all
    ranks, every rank reduces its share with the ``sum_values`` kernel, and
    the per-rank sums are combined on rank 0.

    The block partial sums are produced in float32 by the kernel, so the
    result is an approximation of the exact integer sum.

    Returns
    -------
    float or None
        The total sum on rank 0, ``None`` on every other rank.
    """
    comm = MPI.COMM_WORLD
    my_rank = comm.Get_rank()
    number_of_ranks = comm.Get_size()

    number_of_gpus_per_node = len(cuda.gpus)
    print(f"Rank {my_rank}/{number_of_ranks} on {number_of_gpus_per_node}")

    # Distribute ranks round-robin over the visible GPUs. Numba and CuPy keep
    # track of the current device separately, so select it for both.
    device_id = my_rank % number_of_gpus_per_node
    cuda.select_device(device_id)
    cupy.cuda.Device(device_id).use()

    gpu_info = comm.gather((my_rank, device_id), root=0)

    N = 10000000

    # Split N as evenly as possible: the first `extra` ranks get one element
    # more, so no element is dropped when N is not divisible by the rank count.
    base, extra = divmod(N, number_of_ranks)
    counts = [base + (1 if r < extra else 0) for r in range(number_of_ranks)]
    displs = [r * base + min(r, extra) for r in range(number_of_ranks)]
    local_n = counts[my_rank]

    a_partial = cupy.empty(local_n, dtype=cupy.float32)

    # Set up the launch configuration for the GPU
    block = TPB
    grid = (local_n + block - 1) // block
    partial_sum = cupy.zeros(grid, dtype=cupy.float32)

    # Printing GPU information from rank 0 to verify
    if my_rank == 0:
        for rank, gpu_id in gpu_info:
            print(f"Rank {rank} is using GPU {gpu_id}")

    # Create an array with N elements on the root process
    if my_rank == 0:
        a = cupy.arange(N, dtype=cupy.float32)
        sendbuf = [a, counts, displs, MPI.FLOAT]
    else:
        a = None
        sendbuf = None

    # NOTE: device arrays are handed to MPI directly, which requires a
    # CUDA-aware MPI library (and mpi4py >= 3.1). CuPy work is asynchronous,
    # so make sure the buffers are ready before MPI touches them.
    cupy.cuda.get_current_stream().synchronize()
    comm.Scatterv(sendbuf, a_partial, root=0)

    # A zero-sized grid is not a valid launch; a rank without data simply
    # contributes 0.0.
    if grid > 0:
        sum_values[grid, block](a_partial, partial_sum)
        cuda.synchronize()

    # Accumulate the per-block results in float64 to limit rounding error,
    # then combine the per-rank sums on the root.
    local_sum = float(partial_sum.sum(dtype=cupy.float64))
    total_sum = comm.reduce(local_sum, op=MPI.SUM, root=0)

    # Gather only what is printed (rank, local sum, local size) instead of
    # shipping every rank's whole device array back to the root.
    proc_info = comm.gather((my_rank, local_sum, local_n), root=0)
    if my_rank == 0:
        for rank_proc, psum, part_size in proc_info:
            print(
                f"Rank {rank_proc}/{number_of_ranks} for array "
                f"{part_size}/{a.size} partial sum {psum}"
            )
        return total_sum
    return None
if __name__ == "__main__":
    # Wall-clock timer for the whole run of this process.
    start_time = timeit.default_timer()

    result = main()

    # main() hands back a value only on the rank that owns the final sum;
    # that rank also prints a single-GPU reference value for comparison.
    if result is not None:
        print(f"Sum of array: {result}")
        print(f"Direct sum of array: {cupy.sum(cupy.arange(10000000))}")

    execution_time = timeit.default_timer() - start_time
    print(f"Execution time: {execution_time}")