diff --git a/CMakeLists.txt b/CMakeLists.txt index 40f402a8aafc9e45e699f220b29a02861ce7f180..29cab898e69f3b42b6059ece9fee374559ed2fa8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,13 @@ -cmake_minimum_required(VERSION 3.19) +#cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.30) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++20 -g") project(PARIOX CXX) +#find_package(Boost REQUIRED NO_MODULE) find_package(HPX REQUIRED) find_package(HDF5 REQUIRED CXX) find_package(MPI REQUIRED CXX) +find_package(OpenMP REQUIRED) include_directories(${HDF5_INCLUDE_DIRS}) include_directories(${MPI_INCLUDE_DIRS}) @@ -26,5 +29,11 @@ target_link_libraries(h5_hpx_write HPX::hpx HPX::wrap_main HPX::iostreams_compon add_executable(h5_hpx_read ${PARIOX_EX}/async_read_testbed.cpp ${R123_INCL}/Random123/philox.h ${R123_INCL}/Random123/uniform.hpp)#matrix_mult_write_threadsafe.cpp) target_link_libraries(h5_hpx_read HPX::hpx HPX::wrap_main HPX::iostreams_component ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a) +add_executable(h5_ompi_write ${PARIOX_EX}/sync_write_testbed2.cpp ${R123_INCL}/Random123/philox.h ${R123_INCL}/Random123/uniform.hpp)#matrix_mult_write_threadsafe.cpp) +target_link_libraries(h5_ompi_write ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a OpenMP::OpenMP_CXX) + +add_executable(h5_ompi_read ${PARIOX_EX}/sync_read_testbed2.cpp ${R123_INCL}/Random123/philox.h ${R123_INCL}/Random123/uniform.hpp)#matrix_mult_write_threadsafe.cpp) +target_link_libraries(h5_ompi_read ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a OpenMP::OpenMP_CXX) + #add_executable(h5_hpx_read ${PARIOX_EX}/matrix_mult_read_threadsafe.cpp) #target_link_libraries(h5_hpx_read HPX::hpx HPX::wrap_main HPX::iostreams_component ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a) diff --git a/examples/async_read_testbed.cpp b/examples/async_read_testbed.cpp index 71fd373764bb1b48a6a49e6581c9eaa1796112a9..048504bb977ffb92afdb5c34407c681b111dcba5 100644 --- a/examples/async_read_testbed.cpp +++ b/examples/async_read_testbed.cpp @@ -178,8 +178,16 @@ double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std: double *write_buffer; write_buffer = (double*)calloc(task_local_sizes, sizeof(double)); + hid_t datatype = H5Dget_type(dset_ids[0]); + hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace. 
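// Annotation (not part of the patch): the rank of the file dataspace is queried below so the
// in-memory dataspace can be created with the same dimensionality (1-D here, task_local_sizes
// elements); the subsequent hyperslab selection then restricts the file-side dataspace to this
// task's contiguous slice starting at fspace_offset before the raw H5Dread call.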
+ int ndims = H5Sget_simple_extent_ndims(dataspace); + + hid_t dummy_space_id = H5Screate_simple(ndims, &task_local_sizes, NULL); // describes memory data space + H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_local_sizes, NULL); + clock_gettime(CLOCK_MONOTONIC, &io_start); - read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]); + H5Dread(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer); + //read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]); clock_gettime(CLOCK_MONOTONIC, &io_end); std::size_t file_idx = 0; @@ -219,7 +227,7 @@ double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std: double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; double diff_io_d = p_io_end - p_io_start; double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024); - printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate); + printf("LOC: %4u, Task: %5u, Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate); return task_b_vec; } diff --git a/examples/async_write_testbed.cpp b/examples/async_write_testbed.cpp index 2be768b3f9950122ecdfce4e6eb50bb0456308f5..aff7971d74ae8d95efb4cc5211c1b832f797970e 100644 --- a/examples/async_write_testbed.cpp +++ b/examples/async_write_testbed.cpp @@ -103,7 +103,7 @@ double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std clock_gettime(CLOCK_MONOTONIC, &mat_prep_start); generate_2D_mat(&task_A_mat, m_rows, n_cols, seed); - + std::size_t task_size = m_rows*n_cols; double *write_buffer; std::size_t file_idx; @@ -138,9 +138,17 @@ double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std } free(task_A_mat); + hid_t datatype = H5Dget_type(dset_ids[0]); + hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace. + int ndims = H5Sget_simple_extent_ndims(dataspace); + + hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space + H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL); + // write the task local matrix entries. 
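// Annotation (not part of the patch): the wrapper write_H5_ndim_data_L is bypassed here in favour
// of a direct H5Dwrite. dummy_space_id describes the flat write_buffer in memory, while the
// hyperslab selected on `dataspace` places it at fspace_offset in the file; h5_ids[3] is presumably
// the dataset-transfer property list prepared by prep_H5_file_L (its setup is not shown in this hunk).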
clock_gettime(CLOCK_MONOTONIC, &io_start); - write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]); + //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]); + H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer); clock_gettime(CLOCK_MONOTONIC, &io_end); free(write_buffer); @@ -158,7 +166,7 @@ double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; double diff_io_d = p_io_end - p_io_start; double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024); - printf("LOC: %4u, Task: %5u Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \ + printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \ locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate); return task_b_vec; @@ -272,7 +280,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_ h5_ids = prep_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo.h5"); dtype_id = H5T_NATIVE_DOUBLE; - + ndims = 1; dset_id = attach_H5_dataset_L(&h5_ids[0], "Mat_entries", dtype_id, ndims, &tot_size_1D); x_vec_id = attach_H5_dataset_L(&h5_ids[0], "X_vec", dtype_id, ndims, &total_mat_sizes[1]); @@ -285,7 +293,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_ attach_H5_scalar_attr_L<std::size_t>(&b_vec_id, "Total_rows", dtype_id, &total_mat_sizes[1]); //**************************************Locality ndims data write********************* -/* +/* //COMMENTED OUT // TODO: writing into 1D file data structure is totally fine though... std::size_t fspace_offsets = 0; index_offset_calc(ndims, &loc_size_1D, &tot_size_1D, locality_id, &fspace_offsets); @@ -298,7 +306,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_ } */ //========================================Write Task-offloading============================================= - //==============Automate offsets calculation for various row size============ + //==============Automate offsets calculation for various row size============ // MAKE SURE the division has no remainders for `num_chunks`! // Range of rows_per_task = [1, local_mat_sizes[0]] num_chunks = local_mat_sizes[0]/rows_per_task; @@ -348,7 +356,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_ mesg = "B_val {1} at implicit_i of {2} on locality {3}.\n"; hpx::util::format_to(hpx::cout, mesg, loc_B_vec[i], implicit_i, hpx::find_here()) << std::flush; }*/ - free(task_b_vals); + free(task_b_vals); }; hpx::wait_each(indiv_write_handling, task_writerow_state); @@ -380,13 +388,14 @@ int main(int argc, char **argv){ std::size_t tot_mat_rows, tot_mat_cols; std::size_t remainder_rows, loc_mat_rows; std::size_t rows_per_task; + if (argc < 3){ printf("Error: Insufficient number of command line argument. 
\n \ Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n"); exit(0); } - sscanf(argv[1], "%u", &tot_mat_rows); - sscanf(argv[2], "%u", &rows_per_task); + sscanf(argv[1], "%zu", &tot_mat_rows); + sscanf(argv[2], "%zu", &rows_per_task); std::vector<hpx::id_type> localities = hpx::find_all_localities(); std::uint32_t ntot_localities = hpx::get_num_localities().get(); @@ -409,7 +418,7 @@ int main(int argc, char **argv){ loc_mat_rows = tot_mat_rows / ntot_localities; } preceding_rows[i] = preceding_rows[i-1] + loc_mat_rows; -// printf("preceding_rows[%u] = %u\n", i, preceding_rows[i]); + //printf("preceding_rows[%u] = %zu\n", i, preceding_rows[i]); } std::vector<hpx::future<std::vector<double>>> locality_futures; diff --git a/examples/sync_read_testbed.cpp b/examples/sync_read_testbed.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4aa477cc20c6ad7718aa020713e9b78988cb14b9 --- /dev/null +++ b/examples/sync_read_testbed.cpp @@ -0,0 +1,316 @@ +#include <mpi.h> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <omp.h> + +#include "./pariox_h5.hpp" +#include <Random123/philox.h> +#include <Random123/uniform.hpp> + +int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){ + typedef r123::Philox4x32 RNG; + RNG rng; + RNG::ctr_type c={{}}; + RNG::ukey_type uk={{}}; + uk[0] = locality; // some user_supplied_seed + RNG::key_type k=uk; + RNG::ctr_type r; + + c[0] = 15112024; + c[1] = 16122024; + + std::srand(locality * 15112024); + + *matrix = (double **)calloc(m, sizeof(double*)); + for (std::size_t i = 0; i < m; i++){ + (*matrix)[i] = (double *)calloc(n, sizeof(double)); + } + + for (std::size_t i = 0; i < m; i++){ + for (std::size_t j = 0; j < n; j++){ + c[0] += 1; + c[1] += 1; + r = rng(c, k); + (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX; + //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX; + } + } + return 0; +} + +int generate_vec(double **x_vec, std::size_t n, std::size_t locality){ + std::srand(241119); + + *x_vec = (double *)calloc(n, sizeof(double*)); + for (std::size_t i = 0; i < n; i++){ + (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX; + } + return 0; +} + +double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\ + (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\ + loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]); + struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end; + double ** task_A_mat; + std::size_t seed = locality_id + m_rows*n_cols*task_instance; + + clock_gettime(CLOCK_MONOTONIC, &mat_prep_start); + generate_2D_mat(&task_A_mat, m_rows, n_cols, seed); + + std::size_t task_size = m_rows*n_cols; + double *write_buffer; + std::size_t file_idx; + write_buffer = (double*)calloc(task_size, sizeof(double)); + for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + write_buffer[file_idx] = task_A_mat[i][j]; + } + } + clock_gettime(CLOCK_MONOTONIC, &mat_prep_end); + + clock_gettime(CLOCK_MONOTONIC, &start); + + // Compute the mat * vec + double *task_b_vec; 
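// Annotation (not part of the patch): task_b_vec holds this task's slice of b = A*x; each entry
// below accumulates the dot product of one row of the (flattened) generated matrix with loc_X_vec.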
+ task_b_vec = (double *)calloc(m_rows,sizeof(double)); + + for (std::size_t i = 0; i < m_rows; i++){ + task_b_vec[i] = 0; + //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols); + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx]*loc_X_vec[j]; + // task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j]; + } + //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]); + } + + for (std::size_t i = 0; i < m_rows; i++){ + free(task_A_mat[i]); + } + free(task_A_mat); + + hid_t datatype = H5Dget_type(dset_ids[0]); + hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace. + int ndims = H5Sget_simple_extent_ndims(dataspace); + + hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space + H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL); + + // write the task local matrix entries. + clock_gettime(CLOCK_MONOTONIC, &io_start); + //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]); + H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer); + clock_gettime(CLOCK_MONOTONIC, &io_end); + free(write_buffer); + + clock_gettime(CLOCK_MONOTONIC, &end); + + double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9; + double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9; + double diff_mat_d = p_mat_end - p_mat_start; + + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \ + locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + struct timespec start, end, io_start, io_end; + clock_gettime(CLOCK_MONOTONIC, &start); + + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows, sizeof(double)); + + std::size_t task_local_sizes = m_rows*n_cols; + double *write_buffer; + write_buffer = (double*)calloc(task_local_sizes, sizeof(double)); + + clock_gettime(CLOCK_MONOTONIC, &io_start); + read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]); + clock_gettime(CLOCK_MONOTONIC, &io_end); + + std::size_t file_idx = 0; + /*for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + (*loc_A_mat)[row_offset + i][j] = write_buffer[file_idx]; + } + } + + for (std::size_t i = 0; i < m_rows; i++){ + //printf("Row_off: %u, %u, write_buff: %f %f %f %f\n", row_offset, fspace_offset, \ + write_buffer[0], write_buffer[1], \ + write_buffer[2], write_buffer[3]); + printf("Row_off: %u, %u, Task A mat: %f %f %f %f\n", row_offset, fspace_offset,\ + (*loc_A_mat)[row_offset + i][0], \ + (*loc_A_mat)[row_offset + i][1], \ + (*loc_A_mat)[row_offset + i][2], \ + 
(*loc_A_mat)[row_offset + i][3]); + } + printf("x_vec: %f %f ... %f %f\n", (*loc_X_vec)[0], (*loc_X_vec)[1], (*loc_X_vec)[n_cols-2], \ + (*loc_X_vec)[n_cols-1]);*/ + // Compute! + for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx] * (*loc_X_vec)[j]; + } + } + free(write_buffer); + clock_gettime(CLOCK_MONOTONIC, &end); + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +int main(int argc, char **argv){ + std::size_t tot_mat_rows, tot_mat_cols; + std::size_t remainder_rows, loc_mat_rows; + std::size_t rows_per_task; + if (argc < 3){ + printf("Error: Insufficient number of command line argument. \n \ + Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n"); + exit(0); + } + sscanf(argv[1], "%zu", &tot_mat_rows); + sscanf(argv[2], "%zu", &rows_per_task); + tot_mat_cols = tot_mat_rows; + // ================= Generate x vector===================== + std::vector<hid_t> dset_ids(3); + + std::size_t m_rows, n_cols; + double **loc_A_mat; + int rank, nprocs, num_threads; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + num_threads = omp_get_num_threads(); + + // =========================== Open a H5 file ========================= + std::size_t ndims; + std::vector<hid_t> h5_ids; + hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id; + + h5_ids = open_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5"); + + dtype_id = H5T_NATIVE_DOUBLE; + + std::size_t *Mat_1D_lens; + std::vector<std::size_t> total_mat_sizes(2); + std::vector<std::size_t> local_mat_sizes(2); + std::size_t loc_size_1D; + dset_id = access_H5_dataset_L(&h5_ids[0], "/Mat_entries", &dtype_id, &ndims, &Mat_1D_lens); + + hid_t attr_dtype_id = H5T_NATIVE_UINT; + access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", attr_dtype_id, &total_mat_sizes[1]); + access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", attr_dtype_id, &total_mat_sizes[0]); + + local_mat_sizes[0] = total_mat_sizes[0]/nprocs; + local_mat_sizes[1] = total_mat_sizes[1]; + loc_size_1D = local_mat_sizes[0]*local_mat_sizes[1]; + + std::size_t *dummy_lens; + std::size_t dummy_ndims; + x_vec_id = access_H5_dataset_L(&h5_ids[0], "/X_vec", &dtype_id, &dummy_ndims, &dummy_lens); + b_vec_id = access_H5_dataset_L(&h5_ids[0], "/B_vec", &dtype_id, &dummy_ndims, &dummy_lens); + + m_rows = static_cast<int>(rows_per_task); + n_cols = total_mat_sizes[1]; + double *loc_b_vec, *loc_x_vec; + loc_b_vec = (double *)calloc(local_mat_sizes[0], sizeof(double)); + loc_x_vec = (double *)calloc(local_mat_sizes[1], sizeof(double)); + free(dummy_lens); + + std::size_t x_fspace_id = 0; + read_H5_ndim_data_L<double>(&h5_ids[0], &x_vec_id, &x_fspace_id, &local_mat_sizes[1], &loc_x_vec[0]); + x_fspace_id = rank*local_mat_sizes[0]; + read_H5_ndim_data_L<double>(&h5_ids[0], &b_vec_id, &x_fspace_id, 
&local_mat_sizes[0], &loc_b_vec[0]); + // ================= Pre-compute process & thread offsets============== + int num_chunks; + num_chunks = local_mat_sizes[0]/rows_per_task; + std::vector<std::size_t> fspace_offsets(num_chunks); + index_offset_calc(ndims, &loc_size_1D, &Mat_1D_lens[0], rank, &fspace_offsets[0]); + + std::vector<std::size_t> row_offsets(num_chunks); + std::size_t task_dim_lens = rows_per_task*total_mat_sizes[1]; + std::size_t temp; + for (std::size_t j = 0; j < num_chunks; j++){ + index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp); + fspace_offsets[j] = fspace_offsets[0] + temp; + row_offsets[j] = j*rows_per_task; + // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols); + } + + #pragma omp parallel + { + int thread_num; + thread_num = omp_get_thread_num(); + + double *thread_loc_B; + //thread_loc_B = task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + thread_loc_B = task_compute_read_work(&loc_A_mat, 0, 0, \ + m_rows, n_cols, &loc_x_vec, &h5_ids[0], \ + &dset_id, fspace_offsets[thread_num], rank, thread_num); + + std::size_t initial_offset = thread_num*m_rows; + double diff_val; + for (std::size_t k = 0; k < m_rows; k++){ + diff_val = fabs(thread_loc_B[k] - loc_b_vec[initial_offset + k]); + if (diff_val > 1e-6) printf("Error! Diff: %f\n", diff_val); + } + free(thread_loc_B); + } + + double sum_loc_b = 0; + for (std::size_t i = 0; i < local_mat_sizes[0]; i++){ + sum_loc_b += loc_b_vec[i]; + } + printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b); + + // ================= Write x & b vector per process ==================== + + free(loc_x_vec); + free(loc_b_vec); + + free(Mat_1D_lens); + + close_H5_dataset_L(&dset_id); + close_H5_dataset_L(&x_vec_id); + close_H5_dataset_L(&b_vec_id); + close_H5_file_L(&h5_ids[0]); + + MPI_Finalize(); + return 0; +} diff --git a/examples/sync_read_testbed2.cpp b/examples/sync_read_testbed2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..409ff464f6482ef69c97e1be17224a63fc127ae0 --- /dev/null +++ b/examples/sync_read_testbed2.cpp @@ -0,0 +1,301 @@ +#include <mpi.h> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <omp.h> + +#include "./pariox_h5.hpp" +#include <Random123/philox.h> +#include <Random123/uniform.hpp> + +int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){ + typedef r123::Philox4x32 RNG; + RNG rng; + RNG::ctr_type c={{}}; + RNG::ukey_type uk={{}}; + uk[0] = locality; // some user_supplied_seed + RNG::key_type k=uk; + RNG::ctr_type r; + + c[0] = 15112024; + c[1] = 16122024; + + std::srand(locality * 15112024); + + *matrix = (double **)calloc(m, sizeof(double*)); + for (std::size_t i = 0; i < m; i++){ + (*matrix)[i] = (double *)calloc(n, sizeof(double)); + } + + for (std::size_t i = 0; i < m; i++){ + for (std::size_t j = 0; j < n; j++){ + c[0] += 1; + c[1] += 1; + r = rng(c, k); + (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX; + //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX; + } + } + return 0; +} + +int generate_vec(double **x_vec, std::size_t n, std::size_t locality){ + std::srand(241119); + + *x_vec = (double *)calloc(n, sizeof(double*)); + for (std::size_t i = 0; i < n; i++){ + (*x_vec)[i] = 
static_cast<double>(std::rand()) / RAND_MAX; + } + return 0; +} + +double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\ + (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\ + loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]); + struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end; + double ** task_A_mat; + std::size_t seed = locality_id + m_rows*n_cols*task_instance; + + clock_gettime(CLOCK_MONOTONIC, &mat_prep_start); + generate_2D_mat(&task_A_mat, m_rows, n_cols, seed); + + std::size_t task_size = m_rows*n_cols; + double *write_buffer; + std::size_t file_idx; + write_buffer = (double*)calloc(task_size, sizeof(double)); + for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + write_buffer[file_idx] = task_A_mat[i][j]; + } + } + clock_gettime(CLOCK_MONOTONIC, &mat_prep_end); + + clock_gettime(CLOCK_MONOTONIC, &start); + + // Compute the mat * vec + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows,sizeof(double)); + + for (std::size_t i = 0; i < m_rows; i++){ + task_b_vec[i] = 0; + //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols); + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx]*loc_X_vec[j]; + // task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j]; + } + //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]); + } + + for (std::size_t i = 0; i < m_rows; i++){ + free(task_A_mat[i]); + } + free(task_A_mat); + + hid_t datatype = H5Dget_type(dset_ids[0]); + hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace. + int ndims = H5Sget_simple_extent_ndims(dataspace); + + hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space + H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL); + + // write the task local matrix entries. 
+ clock_gettime(CLOCK_MONOTONIC, &io_start); + //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]); + H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer); + clock_gettime(CLOCK_MONOTONIC, &io_end); + free(write_buffer); + + clock_gettime(CLOCK_MONOTONIC, &end); + + double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9; + double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9; + double diff_mat_d = p_mat_end - p_mat_start; + + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \ + locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + struct timespec start, end, io_start, io_end; + clock_gettime(CLOCK_MONOTONIC, &start); + + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows, sizeof(double)); + + double **mat_buffer; + std::size_t dim_lens[2]; + dim_lens[0] = m_rows; + dim_lens[1] = n_cols; + clock_gettime(CLOCK_MONOTONIC, &io_start); + read_and_unflatten_2D(h5_ids, dset_ids, fspace_offset, &dim_lens[0], &mat_buffer); + clock_gettime(CLOCK_MONOTONIC, &io_end); + + std::size_t task_local_sizes = m_rows*n_cols; + // Compute! + for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + task_b_vec[i] = task_b_vec[i] + mat_buffer[i][j] * (*loc_X_vec)[j]; + } + } + + for(std::size_t i = 0; i < dim_lens[0]; i++){ + free(mat_buffer[i]); + } + free(mat_buffer); + + clock_gettime(CLOCK_MONOTONIC, &end); + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +int main(int argc, char **argv){ + std::size_t tot_mat_rows, tot_mat_cols; + std::size_t remainder_rows, loc_mat_rows; + std::size_t rows_per_task; + if (argc < 3){ + printf("Error: Insufficient number of command line argument. 
\n \ + Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n"); + exit(0); + } + sscanf(argv[1], "%zu", &tot_mat_rows); + sscanf(argv[2], "%zu", &rows_per_task); + tot_mat_cols = tot_mat_rows; + // ================= Generate x vector===================== + std::vector<hid_t> dset_ids(3); + + std::size_t m_rows, n_cols; + double **loc_A_mat; + int rank, nprocs, num_threads; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + num_threads = omp_get_num_threads(); + + // =========================== Open a H5 file ========================= + std::size_t ndims; + std::vector<hid_t> h5_ids; + hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id; + + h5_ids = open_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5"); + + dtype_id = H5T_NATIVE_DOUBLE; + + std::size_t *Mat_1D_lens; + std::vector<std::size_t> total_mat_sizes(2); + std::vector<std::size_t> local_mat_sizes(2); + std::size_t loc_size_1D; + dset_id = access_H5_dataset_L(&h5_ids[0], "/Mat_entries", &dtype_id, &ndims, &Mat_1D_lens); + + hid_t attr_dtype_id = H5T_NATIVE_UINT; + access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", attr_dtype_id, &total_mat_sizes[1]); + access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", attr_dtype_id, &total_mat_sizes[0]); + + local_mat_sizes[0] = total_mat_sizes[0]/nprocs; + local_mat_sizes[1] = total_mat_sizes[1]; + loc_size_1D = local_mat_sizes[0]*local_mat_sizes[1]; + + std::size_t *dummy_lens; + std::size_t dummy_ndims; + x_vec_id = access_H5_dataset_L(&h5_ids[0], "/X_vec", &dtype_id, &dummy_ndims, &dummy_lens); + b_vec_id = access_H5_dataset_L(&h5_ids[0], "/B_vec", &dtype_id, &dummy_ndims, &dummy_lens); + + m_rows = static_cast<int>(rows_per_task); + n_cols = total_mat_sizes[1]; + double *loc_b_vec, *loc_x_vec; + loc_b_vec = (double *)calloc(local_mat_sizes[0], sizeof(double)); + loc_x_vec = (double *)calloc(local_mat_sizes[1], sizeof(double)); + free(dummy_lens); + + std::size_t x_fspace_id = 0; + read_H5_ndim_data_L<double>(&h5_ids[0], &x_vec_id, &x_fspace_id, &local_mat_sizes[1], &loc_x_vec[0]); + x_fspace_id = rank*local_mat_sizes[0]; + read_H5_ndim_data_L<double>(&h5_ids[0], &b_vec_id, &x_fspace_id, &local_mat_sizes[0], &loc_b_vec[0]); + // ================= Pre-compute process & thread offsets============== + int num_chunks; + num_chunks = local_mat_sizes[0]/rows_per_task; + std::vector<std::size_t> fspace_offsets(num_chunks); + index_offset_calc(ndims, &loc_size_1D, &Mat_1D_lens[0], rank, &fspace_offsets[0]); + + std::vector<std::size_t> row_offsets(num_chunks); + std::size_t task_dim_lens = rows_per_task*total_mat_sizes[1]; + std::size_t temp; + for (std::size_t j = 0; j < num_chunks; j++){ + index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp); + fspace_offsets[j] = fspace_offsets[0] + temp; + row_offsets[j] = j*rows_per_task; + // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols); + } + + #pragma omp parallel + { + int thread_num; + thread_num = omp_get_thread_num(); + + double *thread_loc_B; + //thread_loc_B = task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + thread_loc_B = task_compute_read_work(&loc_A_mat, 0, 0, \ + m_rows, n_cols, &loc_x_vec, &h5_ids[0], \ + 
&dset_id, fspace_offsets[thread_num], rank, thread_num); + + std::size_t initial_offset = thread_num*m_rows; + double diff_val; + for (std::size_t k = 0; k < m_rows; k++){ + diff_val = fabs(thread_loc_B[k] - loc_b_vec[initial_offset + k]); + if (diff_val > 1e-6) printf("Error! Diff: %f\n", diff_val); + } + free(thread_loc_B); + } + + double sum_loc_b = 0; + for (std::size_t i = 0; i < local_mat_sizes[0]; i++){ + sum_loc_b += loc_b_vec[i]; + } + printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b); + + // ================= Write x & b vector per process ==================== + + free(loc_x_vec); + free(loc_b_vec); + + free(Mat_1D_lens); + + close_H5_dataset_L(&dset_id); + close_H5_dataset_L(&x_vec_id); + close_H5_dataset_L(&b_vec_id); + close_H5_file_L(&h5_ids[0]); + + MPI_Finalize(); + return 0; +} diff --git a/examples/sync_write_testbed.cpp b/examples/sync_write_testbed.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a1f838bf6935b063c1f6973a7f3cc884d0f6047 --- /dev/null +++ b/examples/sync_write_testbed.cpp @@ -0,0 +1,316 @@ +#include <mpi.h> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <omp.h> + +#include "./pariox_h5.hpp" +#include <Random123/philox.h> +#include <Random123/uniform.hpp> + +int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){ + typedef r123::Philox4x32 RNG; + RNG rng; + RNG::ctr_type c={{}}; + RNG::ukey_type uk={{}}; + uk[0] = locality; // some user_supplied_seed + RNG::key_type k=uk; + RNG::ctr_type r; + + c[0] = 15112024; + c[1] = 16122024; + + std::srand(locality * 15112024); + + *matrix = (double **)calloc(m, sizeof(double*)); + for (std::size_t i = 0; i < m; i++){ + (*matrix)[i] = (double *)calloc(n, sizeof(double)); + } + + for (std::size_t i = 0; i < m; i++){ + for (std::size_t j = 0; j < n; j++){ + c[0] += 1; + c[1] += 1; + r = rng(c, k); + (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX; + //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX; + } + } + return 0; +} + +int generate_vec(double **x_vec, std::size_t n, std::size_t locality){ + std::srand(241119); + + *x_vec = (double *)calloc(n, sizeof(double*)); + for (std::size_t i = 0; i < n; i++){ + (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX; + } + return 0; +} + +double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\ + (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\ + loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]); + struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end; + double ** task_A_mat; + std::size_t seed = locality_id + m_rows*n_cols*task_instance; + + clock_gettime(CLOCK_MONOTONIC, &mat_prep_start); + generate_2D_mat(&task_A_mat, m_rows, n_cols, seed); + + std::size_t task_size = m_rows*n_cols; + std::size_t file_idx; + double *write_buffer; + + //======================try out flattening functions=================== + double **ptr_of_pointers; + std::size_t dim_lens[2]; + dim_lens[0] = m_rows; + dim_lens[1] = n_cols; + prep_pointers(&dim_lens[0], &ptr_of_pointers, task_A_mat); + flatten(2, &dim_lens[0], ptr_of_pointers, &write_buffer); + free(ptr_of_pointers); + 
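// Annotation (not part of the patch): prep_pointers/flatten appear to build a row-pointer table
// over task_A_mat and copy it into the contiguous 1-D write_buffer, i.e. the same row-major layout
// produced by the hand-written flattening loop kept (commented out) just below for reference.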
//====================================================================== + //====================Normal way of flattening========================== +/* write_buffer = (double*)calloc(task_size, sizeof(double)); + for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + write_buffer[file_idx] = task_A_mat[i][j]; + } + }*/ + //======================================================================= + clock_gettime(CLOCK_MONOTONIC, &mat_prep_end); + + clock_gettime(CLOCK_MONOTONIC, &start); + + // Compute the mat * vec + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows,sizeof(double)); + + for (std::size_t i = 0; i < m_rows; i++){ + task_b_vec[i] = 0; + //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols); + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx]*loc_X_vec[j]; + // task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j]; + } + //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]); + } + + for (std::size_t i = 0; i < m_rows; i++){ + free(task_A_mat[i]); + } + free(task_A_mat); + + hid_t datatype = H5Dget_type(dset_ids[0]); + hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace. + int ndims = H5Sget_simple_extent_ndims(dataspace); + + hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space + H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL); + + // write the task local matrix entries. + clock_gettime(CLOCK_MONOTONIC, &io_start); + //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]); + H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer); + clock_gettime(CLOCK_MONOTONIC, &io_end); + free(write_buffer); + + clock_gettime(CLOCK_MONOTONIC, &end); + + double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9; + double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9; + double diff_mat_d = p_mat_end - p_mat_start; + + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \ + locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + struct timespec start, end, io_start, io_end; + clock_gettime(CLOCK_MONOTONIC, &start); + + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows, sizeof(double)); + + std::size_t task_local_sizes = m_rows*n_cols; + double *write_buffer; + write_buffer = (double*)calloc(task_local_sizes, sizeof(double)); + + clock_gettime(CLOCK_MONOTONIC, &io_start); + read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]); + clock_gettime(CLOCK_MONOTONIC, &io_end); + + std::size_t 
file_idx = 0; + /*for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + (*loc_A_mat)[row_offset + i][j] = write_buffer[file_idx]; + } + } + + for (std::size_t i = 0; i < m_rows; i++){ + //printf("Row_off: %u, %u, write_buff: %f %f %f %f\n", row_offset, fspace_offset, \ + write_buffer[0], write_buffer[1], \ + write_buffer[2], write_buffer[3]); + printf("Row_off: %u, %u, Task A mat: %f %f %f %f\n", row_offset, fspace_offset,\ + (*loc_A_mat)[row_offset + i][0], \ + (*loc_A_mat)[row_offset + i][1], \ + (*loc_A_mat)[row_offset + i][2], \ + (*loc_A_mat)[row_offset + i][3]); + } + printf("x_vec: %f %f ... %f %f\n", (*loc_X_vec)[0], (*loc_X_vec)[1], (*loc_X_vec)[n_cols-2], \ + (*loc_X_vec)[n_cols-1]);*/ + // Compute! + for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx] * (*loc_X_vec)[j]; + } + } + free(write_buffer); + clock_gettime(CLOCK_MONOTONIC, &end); + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +int main(int argc, char **argv){ + std::size_t tot_mat_rows, tot_mat_cols; + std::size_t remainder_rows, loc_mat_rows; + std::size_t rows_per_task; + if (argc < 3){ + printf("Error: Insufficient number of command line argument. 
\n \ + Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n"); + exit(0); + } + sscanf(argv[1], "%zu", &tot_mat_rows); + sscanf(argv[2], "%zu", &rows_per_task); + tot_mat_cols = tot_mat_rows; + // ================= Generate x vector===================== + double *loc_x_vec; + generate_vec(&loc_x_vec, tot_mat_rows, 0); + + std::size_t m_rows, n_cols; + double **loc_A_mat; + int rank, nprocs, num_threads; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + num_threads = omp_get_num_threads(); + loc_mat_rows = tot_mat_rows/nprocs; + m_rows = static_cast<int>(rows_per_task); + n_cols = tot_mat_rows; + + double *loc_b_vec; + loc_b_vec = (double *)calloc(loc_mat_rows, sizeof(double)); + // =========================== Open a H5 file ========================= + int ndims; + std::size_t tot_size_1D; + tot_size_1D = tot_mat_cols*tot_mat_rows; + std::vector<hid_t> h5_ids; + hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id; + + h5_ids = prep_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5"); + + dtype_id = H5T_NATIVE_DOUBLE; + + ndims = 1; + dset_id = attach_H5_dataset_L(&h5_ids[0], "Mat_entries", dtype_id, ndims, &tot_size_1D); + x_vec_id = attach_H5_dataset_L(&h5_ids[0], "X_vec", dtype_id, ndims, &tot_mat_cols); + b_vec_id = attach_H5_dataset_L(&h5_ids[0], "B_vec", dtype_id, ndims, &tot_mat_cols); + + dtype_id = H5T_NATIVE_UINT; + attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", dtype_id, &tot_mat_rows); + attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", dtype_id, &tot_mat_cols); + attach_H5_scalar_attr_L<std::size_t>(&x_vec_id, "Total_rows", dtype_id, &tot_mat_cols); + attach_H5_scalar_attr_L<std::size_t>(&b_vec_id, "Total_rows", dtype_id, &tot_mat_cols); + + + // ================= Pre-compute process & thread offsets============== + std::size_t loc_size_1D; + int num_chunks; + num_chunks = loc_mat_rows/rows_per_task; + std::vector<std::size_t> fspace_offsets(num_chunks); + loc_size_1D = tot_mat_cols*loc_mat_rows; + index_offset_calc(ndims, &loc_size_1D, &tot_size_1D, rank, &fspace_offsets[0]); + + std::vector<std::size_t> row_offsets(num_chunks); + std::size_t task_dim_lens = rows_per_task*tot_mat_cols; + std::size_t temp; + for (std::size_t j = 0; j < num_chunks; j++){ + index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp); + fspace_offsets[j] = fspace_offsets[0] + temp; + row_offsets[j] = j*rows_per_task; + // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols); + } + + #pragma omp parallel + { + int thread_num; + thread_num = omp_get_thread_num(); + + double *thread_loc_B; + thread_loc_B = task_compute_write_work(&loc_A_mat, 0, 0, \ + m_rows, n_cols, &loc_x_vec[0], &h5_ids[0], \ + &dset_id, fspace_offsets[thread_num], rank, thread_num); + + std::size_t initial_offset = thread_num*m_rows; + for (std::size_t k = 0; k < m_rows; k++){ + loc_b_vec[initial_offset + k] = thread_loc_B[k]; + } + free(thread_loc_B); + } + + double sum_loc_b = 0; + for (std::size_t i = 0; i < loc_mat_rows; i++){ + sum_loc_b += loc_b_vec[i]; + } + printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b); + + ndims = 1; + index_offset_calc(ndims, &loc_mat_rows, &tot_mat_rows, rank, &fspace_offsets[0]); + write_H5_ndim_data_L(&h5_ids[0], &b_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_b_vec[0]); + write_H5_ndim_data_L(&h5_ids[0], &x_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_x_vec[fspace_offsets[0]]); + // 
================= Write x & b vector per process ==================== + + free(loc_x_vec); + free(loc_b_vec); + + close_H5_dataset_L(&dset_id); + close_H5_dataset_L(&x_vec_id); + close_H5_dataset_L(&b_vec_id); + close_H5_file_L(&h5_ids[0]); + + MPI_Finalize(); + return 0; +} diff --git a/examples/sync_write_testbed2.cpp b/examples/sync_write_testbed2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c27382d8fcb894f9dc573812654a63ddfdb1425 --- /dev/null +++ b/examples/sync_write_testbed2.cpp @@ -0,0 +1,292 @@ +#include <mpi.h> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <omp.h> + +#include "./pariox_h5.hpp" +#include <Random123/philox.h> +#include <Random123/uniform.hpp> + +int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){ + typedef r123::Philox4x32 RNG; + RNG rng; + RNG::ctr_type c={{}}; + RNG::ukey_type uk={{}}; + uk[0] = locality; // some user_supplied_seed + RNG::key_type k=uk; + RNG::ctr_type r; + + c[0] = 15112024; + c[1] = 16122024; + + std::srand(locality * 15112024); + + *matrix = (double **)calloc(m, sizeof(double*)); + for (std::size_t i = 0; i < m; i++){ + (*matrix)[i] = (double *)calloc(n, sizeof(double)); + } + + for (std::size_t i = 0; i < m; i++){ + for (std::size_t j = 0; j < n; j++){ + c[0] += 1; + c[1] += 1; + r = rng(c, k); + (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX; + //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX; + } + } + return 0; +} + +int generate_vec(double **x_vec, std::size_t n, std::size_t locality){ + std::srand(241119); + + *x_vec = (double *)calloc(n, sizeof(double*)); + for (std::size_t i = 0; i < n; i++){ + (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX; + } + return 0; +} + +double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\ + (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\ + loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]); + struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end; + double ** task_A_mat; + std::size_t seed = locality_id + m_rows*n_cols*task_instance; + + clock_gettime(CLOCK_MONOTONIC, &mat_prep_start); + generate_2D_mat(&task_A_mat, m_rows, n_cols, seed); + + std::size_t task_size = m_rows*n_cols; + + //======================try out flattening functions=================== + std::size_t dim_lens[2]; + dim_lens[0] = m_rows; + dim_lens[1] = n_cols; + //====================================================================== + clock_gettime(CLOCK_MONOTONIC, &mat_prep_end); + + clock_gettime(CLOCK_MONOTONIC, &start); + + // Compute the mat * vec + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows,sizeof(double)); + + for (std::size_t i = 0; i < m_rows; i++){ + task_b_vec[i] = 0; + //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols); + for (std::size_t j = 0; j < n_cols; j++){ + task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j]; + } + //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]); + } + + //============================Alternative 1D write============================ + clock_gettime(CLOCK_MONOTONIC, &io_start); + flatten_and_write_2D(h5_ids, dset_ids, 
fspace_offset, dim_lens, &task_A_mat); + clock_gettime(CLOCK_MONOTONIC, &io_end); + + //============================================================================ + + clock_gettime(CLOCK_MONOTONIC, &end); + + double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9; + double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9; + double diff_mat_d = p_mat_end - p_mat_start; + + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \ + locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate); + + for (std::size_t i = 0; i < m_rows; i++){ + free(task_A_mat[i]); + } + free(task_A_mat); + + return task_b_vec; +} + +double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \ + std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \ + hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \ + std::size_t task_instance){ + struct timespec start, end, io_start, io_end; + clock_gettime(CLOCK_MONOTONIC, &start); + + double *task_b_vec; + task_b_vec = (double *)calloc(m_rows, sizeof(double)); + + std::size_t task_local_sizes = m_rows*n_cols; + double *write_buffer; + write_buffer = (double*)calloc(task_local_sizes, sizeof(double)); + + clock_gettime(CLOCK_MONOTONIC, &io_start); + read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]); + clock_gettime(CLOCK_MONOTONIC, &io_end); + + std::size_t file_idx = 0; + /*for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + (*loc_A_mat)[row_offset + i][j] = write_buffer[file_idx]; + } + } + + for (std::size_t i = 0; i < m_rows; i++){ + //printf("Row_off: %u, %u, write_buff: %f %f %f %f\n", row_offset, fspace_offset, \ + write_buffer[0], write_buffer[1], \ + write_buffer[2], write_buffer[3]); + printf("Row_off: %u, %u, Task A mat: %f %f %f %f\n", row_offset, fspace_offset,\ + (*loc_A_mat)[row_offset + i][0], \ + (*loc_A_mat)[row_offset + i][1], \ + (*loc_A_mat)[row_offset + i][2], \ + (*loc_A_mat)[row_offset + i][3]); + } + printf("x_vec: %f %f ... %f %f\n", (*loc_X_vec)[0], (*loc_X_vec)[1], (*loc_X_vec)[n_cols-2], \ + (*loc_X_vec)[n_cols-1]);*/ + // Compute! 
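// Annotation (not part of the patch): the buffer read back from the file is row-major, so
// file_idx = i*n_cols + j walks it one row at a time while b[i] accumulates write_buffer[file_idx]*x[j].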
+ for (std::size_t i = 0; i < m_rows; i++){ + for (std::size_t j = 0; j < n_cols; j++){ + file_idx = i*n_cols + j; + task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx] * (*loc_X_vec)[j]; + } + } + free(write_buffer); + clock_gettime(CLOCK_MONOTONIC, &end); + double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9; + double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9; + double diff_d = p_end - p_start; + + double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9; + double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9; + double diff_io_d = p_io_end - p_io_start; + double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024); + printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate); + + return task_b_vec; +} + +int main(int argc, char **argv){ + std::size_t tot_mat_rows, tot_mat_cols; + std::size_t remainder_rows, loc_mat_rows; + std::size_t rows_per_task; + if (argc < 3){ + printf("Error: Insufficient number of command line argument. \n \ + Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n"); + exit(0); + } + sscanf(argv[1], "%zu", &tot_mat_rows); + sscanf(argv[2], "%zu", &rows_per_task); + tot_mat_cols = tot_mat_rows; + // ================= Generate x vector===================== + double *loc_x_vec; + generate_vec(&loc_x_vec, tot_mat_rows, 0); + + std::size_t m_rows, n_cols; + double **loc_A_mat; + int rank, nprocs, num_threads; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + num_threads = omp_get_num_threads(); + loc_mat_rows = tot_mat_rows/nprocs; + m_rows = static_cast<int>(rows_per_task); + n_cols = tot_mat_rows; + + double *loc_b_vec; + loc_b_vec = (double *)calloc(loc_mat_rows, sizeof(double)); + // =========================== Open a H5 file ========================= + int ndims; + std::size_t tot_size_1D; + tot_size_1D = tot_mat_cols*tot_mat_rows; + std::vector<hid_t> h5_ids; + hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id; + + h5_ids = prep_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5"); + + dtype_id = H5T_NATIVE_DOUBLE; + + ndims = 1; + dset_id = attach_H5_dataset_L(&h5_ids[0], "Mat_entries", dtype_id, ndims, &tot_size_1D); + x_vec_id = attach_H5_dataset_L(&h5_ids[0], "X_vec", dtype_id, ndims, &tot_mat_cols); + b_vec_id = attach_H5_dataset_L(&h5_ids[0], "B_vec", dtype_id, ndims, &tot_mat_cols); + + dtype_id = H5T_NATIVE_UINT; + attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", dtype_id, &tot_mat_rows); + attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", dtype_id, &tot_mat_cols); + attach_H5_scalar_attr_L<std::size_t>(&x_vec_id, "Total_rows", dtype_id, &tot_mat_cols); + attach_H5_scalar_attr_L<std::size_t>(&b_vec_id, "Total_rows", dtype_id, &tot_mat_cols); + + + // ================= Pre-compute process & thread offsets============== + std::size_t loc_size_1D; + int num_chunks; + num_chunks = loc_mat_rows/rows_per_task; + std::vector<std::size_t> fspace_offsets(num_chunks); + loc_size_1D = tot_mat_cols*loc_mat_rows; + index_offset_calc(ndims, &loc_size_1D, &tot_size_1D, rank, &fspace_offsets[0]); + + std::vector<std::size_t> row_offsets(num_chunks); + std::size_t task_dim_lens = rows_per_task*tot_mat_cols; + std::size_t temp; + for (std::size_t j = 0; j < num_chunks; j++){ + index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp); + fspace_offsets[j] = 
fspace_offsets[0] + temp; + row_offsets[j] = j*rows_per_task; + // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols); + } + + #pragma omp parallel + { + int thread_num; + thread_num = omp_get_thread_num(); + + double *thread_loc_B; + thread_loc_B = task_compute_write_work(&loc_A_mat, 0, 0, \ + m_rows, n_cols, &loc_x_vec[0], &h5_ids[0], \ + &dset_id, fspace_offsets[thread_num], rank, thread_num); + + std::size_t initial_offset = thread_num*m_rows; + for (std::size_t k = 0; k < m_rows; k++){ + loc_b_vec[initial_offset + k] = thread_loc_B[k]; + } + free(thread_loc_B); + } + + double sum_loc_b = 0; + for (std::size_t i = 0; i < loc_mat_rows; i++){ + sum_loc_b += loc_b_vec[i]; + } + printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b); + + ndims = 1; + index_offset_calc(ndims, &loc_mat_rows, &tot_mat_rows, rank, &fspace_offsets[0]); + write_H5_ndim_data_L(&h5_ids[0], &b_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_b_vec[0]); + write_H5_ndim_data_L(&h5_ids[0], &x_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_x_vec[fspace_offsets[0]]); + // ================= Write x & b vector per process ==================== + + free(loc_x_vec); + free(loc_b_vec); + + close_H5_dataset_L(&dset_id); + close_H5_dataset_L(&x_vec_id); + close_H5_dataset_L(&b_vec_id); + close_H5_file_L(&h5_ids[0]); + + MPI_Finalize(); + return 0; +} diff --git a/src/pariox_h5.cpp b/src/pariox_h5.cpp index 15da7d3a9618e2b03c74bc24653acf3a02298bf9..8e54facc0501b281dda04c3b800420bf8ad3b9f5 100644 --- a/src/pariox_h5.cpp +++ b/src/pariox_h5.cpp @@ -1,6 +1,6 @@ #include "./pariox_h5.hpp" -int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *total_lens, \ +int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *dim_lens, \ std::size_t curr_index, std::size_t *outcome){ std::size_t remainder = 1; std::size_t *base; @@ -10,7 +10,7 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size std::vector<std::size_t> unit_counts(ndims); for (std::size_t i = 0; i < ndims; i++){ - unit_counts[i] = total_lens[i]/unit_chunk_lens[i]; + unit_counts[i] = dim_lens[i]/unit_chunk_lens[i]; } for (std::size_t i = ndims-1; i > 0; i--){ @@ -19,7 +19,7 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size } std::size_t i = 0; - while ((i < ndims-1) && (remainder != 0)){ + while (i < ndims-1){ outcome[i] = floor(curr_index/base[i]); remainder = curr_index%base[i]; curr_index = remainder; @@ -34,8 +34,9 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size for (std::size_t i = 0; i < ndims; i++){ outcome[i] *= unit_chunk_lens[i]; } + free(base); } else if (ndims == 1){ - std::size_t unit_count = *total_lens/ *unit_chunk_lens; + std::size_t unit_count = *dim_lens/ *unit_chunk_lens; *outcome = (curr_index%unit_count) * (*unit_chunk_lens); } return 0; @@ -43,11 +44,19 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size std::vector<hid_t> prep_H5_file_L(MPI_Comm input_comm, const char* file_name){ std::vector<hid_t> h5_ids; + hbool_t file_locking, dummy_bool; h5_ids.resize(4); + hid_t dummy_file_handle; h5_ids[0] = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(h5_ids[0], input_comm, MPI_INFO_NULL); + H5Pget_file_locking(h5_ids[0], &file_locking, &dummy_bool); + if (file_locking != 0){ + printf("File locking was TRUE.\n"); + H5Pset_file_locking(h5_ids[0], 0, 1); + } + h5_ids[1] = H5Pcreate(H5P_FILE_CREATE); h5_ids[2] = H5Fcreate(file_name, 
diff --git a/src/pariox_h5.hpp b/src/pariox_h5.hpp
index 02095d5244780359bb3682bab3f1bf338a0f1d20..4693448a58218699caf89ecd0cf058e50a6a9834 100644
--- a/src/pariox_h5.hpp
+++ b/src/pariox_h5.hpp
@@ -13,9 +13,230 @@
 #include <cmath>
 #include <vector>
 
-int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *total_lens, \
+int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *dim_lens, \
                       std::size_t curr_index, std::size_t *outcome);
 
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T **mat){
+  std::size_t ndim = 2;
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t dim_iter[ndim];
+  std::size_t unit_chunk_lens[ndim];
+
+  for (std::size_t i = 0; i < ndim-1; i++){
+    ptr_size *= dim_lens[i];
+    unit_chunk_lens[i] = 1;
+  }
+  unit_chunk_lens[ndim-1] = 1;
+  (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+  total_size = ptr_size * dim_lens[ndim-1];
+  std::size_t ptr_i = 0;
+  for (std::size_t idx = 0; idx < total_size; idx++){
+    index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+    if (idx%dim_lens[ndim-1] == 0){
+      (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][0];
+      ptr_i++;
+    }
+  }
+  return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T ***mat){
+  std::size_t ndim = 3;
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t dim_iter[ndim];
+  std::size_t unit_chunk_lens[ndim];
+
+  for (std::size_t i = 0; i < ndim-1; i++){
+    ptr_size *= dim_lens[i];
+    unit_chunk_lens[i] = 1;
+  }
+  unit_chunk_lens[ndim-1] = 1;
+  (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+  total_size = ptr_size * dim_lens[ndim-1];
+  std::size_t ptr_i = 0;
+  for (std::size_t idx = 0; idx < total_size; idx++){
+    index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+    if (idx%dim_lens[ndim-1] == 0){
+      (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][0];
+      ptr_i++;
+    }
+  }
+  return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T ****mat){
+  std::size_t ndim = 4;
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t dim_iter[ndim];
+  std::size_t unit_chunk_lens[ndim];
+
+  for (std::size_t i = 0; i < ndim-1; i++){
+    ptr_size *= dim_lens[i];
+    unit_chunk_lens[i] = 1;
+  }
+  unit_chunk_lens[ndim-1] = 1;
+  (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+  total_size = ptr_size * dim_lens[ndim-1];
+  std::size_t ptr_i = 0;
+  for (std::size_t idx = 0; idx < total_size; idx++){
+    index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+    if (idx%dim_lens[ndim-1] == 0){
+      (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][dim_iter[2]][0];
+      ptr_i++;
+    }
+  }
+  return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T *****mat){
+  std::size_t ndim = 5;
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t dim_iter[ndim];
+  std::size_t unit_chunk_lens[ndim];
+
+  for (std::size_t i = 0; i < ndim-1; i++){
+    ptr_size *= dim_lens[i];
+    unit_chunk_lens[i] = 1;
+  }
+  unit_chunk_lens[ndim-1] = 1;
+  (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+  total_size = ptr_size * dim_lens[ndim-1];
+  std::size_t ptr_i = 0;
+  for (std::size_t idx = 0; idx < total_size; idx++){
+    index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+    if (idx%dim_lens[ndim-1] == 0){
+      (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][dim_iter[2]][dim_iter[3]][0];
+      ptr_i++;
+    }
+  }
+  return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T ******mat){
+  std::size_t ndim = 6;
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t dim_iter[ndim];
+  std::size_t unit_chunk_lens[ndim];
+
+  for (std::size_t i = 0; i < ndim-1; i++){
+    ptr_size *= dim_lens[i];
+    unit_chunk_lens[i] = 1;
+  }
+  unit_chunk_lens[ndim-1] = 1;
+  (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+  total_size = ptr_size * dim_lens[ndim-1];
+  std::size_t ptr_i = 0;
+  for (std::size_t idx = 0; idx < total_size; idx++){
+    index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+    if (idx%dim_lens[ndim-1] == 0){
+      (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][dim_iter[2]]\
+                                      [dim_iter[3]][dim_iter[4]][0];
+      ptr_i++;
+    }
+  }
+  return 0;
+};
+
+template <typename T>
+int flatten_old(std::size_t cur_dim, std::size_t total_ndim, std::size_t *sizes, std::size_t *indices, \
+                T **ptr_of_pointers, T **buffer_1D){
+  if (cur_dim < total_ndim - 1){
+    if (cur_dim == 0){
+      std::size_t total_size = 1;
+      for (std::size_t i = 0; i < total_ndim; i++){
+        total_size *= sizes[i];
+      }
+      *buffer_1D = (T *)calloc(total_size, sizeof(T));
+    }
+
+    for(std::size_t i = 0; i < sizes[cur_dim]; i++){
+      indices[cur_dim] = i;
+      flatten_old(cur_dim + 1, total_ndim, sizes, indices, ptr_of_pointers, buffer_1D);
+    }
+  }else{
+    std::size_t idx = 0;
+    std::size_t buff_idx = 0;
+    std::size_t temp_idx = 0;
+    std::size_t *cumu_sizes;
+    T *ptr;
+    cumu_sizes = (std::size_t *) calloc(total_ndim, sizeof(std::size_t));
+    for (std::size_t i = 0; i < total_ndim; i++){
+      cumu_sizes[i] = 1;
+      for (std::size_t j = i+1; j < total_ndim - 1; j++){
+        cumu_sizes[i] *= sizes[j];
+      }
+    }
+
+    for (std::size_t i = 0; i < total_ndim - 1; i++){
+      idx += indices[i]*cumu_sizes[i];
+    }
+    ptr = ptr_of_pointers[idx];
+    temp_idx = idx*sizes[total_ndim-1];
+    for (std::size_t i = 0; i < sizes[total_ndim-1]; i++){
+      buff_idx = temp_idx + i;
+      (*buffer_1D)[buff_idx] = ptr[i];
+    }
+    free(cumu_sizes);
+  }
+  return 0;
+};
+
+template <typename T>
+int flatten(std::size_t total_ndim, std::size_t *dim_lens, \
+            T **ptr_of_pointers, T **buffer_1D){
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t j, idx;
+  for (std::size_t i = 0; i < total_ndim - 1; i++){
+    ptr_size *= dim_lens[i];
+  }
+  total_size = ptr_size * dim_lens[total_ndim - 1];
+  *buffer_1D = (T *)calloc(total_size, sizeof(T));
+
+  for (std::size_t i = 0; i < total_size; i++){
+    j = i%dim_lens[total_ndim - 1];
+    idx = i/dim_lens[total_ndim - 1];
+    (*buffer_1D)[i] = ptr_of_pointers[idx][j];
+  }
+
+  return 0;
+};
+
+template <typename T>
+int unflatten(std::size_t total_ndim, std::size_t *dim_lens, \
+              T **ptr_of_pointers, T **buffer_1D){
+  std::size_t ptr_size = 1;
+  std::size_t total_size = 1;
+  std::size_t j, idx;
+  for (std::size_t i = 0; i < total_ndim - 1; i++){
+    ptr_size *= dim_lens[i];
+  }
+  total_size = ptr_size * dim_lens[total_ndim - 1];
+
+  for (std::size_t i = 0; i < total_size; i++){
+    j = i%dim_lens[total_ndim - 1];
+    idx = i/dim_lens[total_ndim - 1];
+    ptr_of_pointers[idx][j] = (*buffer_1D)[i];
+  }
+
+  return 0;
+};
+
 /* Description of the std::vector<hid_t> output, h5_ids.
    h5_ids[0] = fapl_id
    h5_ids[1] = fcpl_id
@@ -73,7 +294,7 @@ int write_H5_ndim_data_T(hid_t *h5_ids, hid_t *data_id, std::size_t *fspace_idx,
   hid_t datatype = H5Dget_type(*data_id);
   hid_t dataspace = H5Dget_space(*data_id);// the in-file dataset's dataspace.
   int ndims = H5Sget_simple_extent_ndims(dataspace);
-  std::vector<std::size_t> dim_lens(ndims);
+  std::size_t dim_lens[ndims];
   //printf("fspace_idxs: %u %u task_local_sizes: %u %u matrix_entry: %f\n", fspace_idx[0], fspace_idx[1], task_local_sizes[0], task_local_sizes[1], data_write[0]);
 
   std::size_t *offsets;
@@ -159,4 +380,53 @@ int read_H5_ndim_data_T(hid_t* h5_ids, hid_t *data_id, std::size_t *fspace_idx,
 
   return 0;
 };
+
+template <typename T>
+int flatten_and_write_2D(hid_t *h5_ids, hid_t *data_id, std::size_t fspace_offset, \
+                         std::size_t *dim_lens, T ***write_buffer){
+  std::size_t ndims = 2;
+  std::size_t size_1D = 1;
+  T **ptr_of_pointers;
+  T *buffer_1D;
+  prep_pointers(dim_lens, &ptr_of_pointers, (*write_buffer));
+  flatten(ndims, dim_lens, ptr_of_pointers, &buffer_1D);
+  free(ptr_of_pointers);
+
+  for (std::size_t i = 0; i < ndims; i++){
+    size_1D *= dim_lens[i];
+  }
+
+  write_H5_ndim_data_L(h5_ids, data_id, &fspace_offset, &size_1D, &buffer_1D[0]);
+  free(buffer_1D);
+
+  return 0;
+};
+
+template <typename T>
+int read_and_unflatten_2D(hid_t *h5_ids, hid_t *data_id, std::size_t fspace_offset, \
+                          std::size_t *dim_lens, T ***read_buffer){
+  std::size_t ndims = 2;
+  std::size_t size_1D = 1;
+  T **ptr_of_pointers;
+  T *buffer_1D;
+
+  for (std::size_t i = 0; i < ndims; i++){
+    size_1D *= dim_lens[i];
+  }
+  buffer_1D = (T *)calloc(size_1D, sizeof(T));
+
+  (*read_buffer) = (T **)calloc(dim_lens[0], sizeof(T *));
+  for (std::size_t i = 0; i < dim_lens[0]; i++){
+    (*read_buffer)[i] = (T *)calloc(dim_lens[1], sizeof(T));
+  }
+
+  prep_pointers(dim_lens, &ptr_of_pointers, (*read_buffer));
+  read_H5_ndim_data_L(h5_ids, data_id, &fspace_offset, &size_1D, buffer_1D);
+  unflatten(ndims, dim_lens, ptr_of_pointers, &buffer_1D);
+  free(ptr_of_pointers);
+  free(buffer_1D);
+
+  return 0;
+}
+
 #endif