diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40f402a8aafc9e45e699f220b29a02861ce7f180..29cab898e69f3b42b6059ece9fee374559ed2fa8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,13 @@
-cmake_minimum_required(VERSION 3.19)
+#cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.30)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++20 -g")
 project(PARIOX CXX)
 
+#find_package(Boost REQUIRED NO_MODULE)
 find_package(HPX REQUIRED)
 find_package(HDF5 REQUIRED CXX)
 find_package(MPI REQUIRED CXX)
+find_package(OpenMP REQUIRED)
 
 include_directories(${HDF5_INCLUDE_DIRS})
 include_directories(${MPI_INCLUDE_DIRS})
@@ -26,5 +29,11 @@ target_link_libraries(h5_hpx_write HPX::hpx HPX::wrap_main HPX::iostreams_compon
 add_executable(h5_hpx_read ${PARIOX_EX}/async_read_testbed.cpp ${R123_INCL}/Random123/philox.h ${R123_INCL}/Random123/uniform.hpp)#matrix_mult_write_threadsafe.cpp) 
 target_link_libraries(h5_hpx_read HPX::hpx HPX::wrap_main HPX::iostreams_component ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a)
 
+add_executable(h5_ompi_write ${PARIOX_EX}/sync_write_testbed2.cpp ${R123_INCL}/Random123/philox.h ${R123_INCL}/Random123/uniform.hpp)#matrix_mult_write_threadsafe.cpp) 
+target_link_libraries(h5_ompi_write ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a OpenMP::OpenMP_CXX)
+
+add_executable(h5_ompi_read ${PARIOX_EX}/sync_read_testbed2.cpp ${R123_INCL}/Random123/philox.h ${R123_INCL}/Random123/uniform.hpp)#matrix_mult_write_threadsafe.cpp) 
+target_link_libraries(h5_ompi_read ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a OpenMP::OpenMP_CXX)
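+# Example invocation (illustrative; the rank and thread counts are assumptions, not part of the build):
+#   OMP_NUM_THREADS=4 mpiexec -n 2 ./h5_ompi_write <matrix first-dimension length> <rows per task>
+#   OMP_NUM_THREADS=4 mpiexec -n 2 ./h5_ompi_read  <matrix first-dimension length> <rows per task>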
+
 #add_executable(h5_hpx_read ${PARIOX_EX}/matrix_mult_read_threadsafe.cpp) 
 #target_link_libraries(h5_hpx_read HPX::hpx HPX::wrap_main HPX::iostreams_component ${HDF5_LIBRARIES} ${MPI_LIBRARIES} ${CMAKE_BINARY_DIR}/libpariox.a)
diff --git a/examples/async_read_testbed.cpp b/examples/async_read_testbed.cpp
index 71fd373764bb1b48a6a49e6581c9eaa1796112a9..048504bb977ffb92afdb5c34407c681b111dcba5 100644
--- a/examples/async_read_testbed.cpp
+++ b/examples/async_read_testbed.cpp
@@ -178,8 +178,16 @@ double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std:
     double *write_buffer;
     write_buffer = (double*)calloc(task_local_sizes, sizeof(double));
 
+    hid_t datatype = H5Dget_type(dset_ids[0]);
+    hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace.
+    int ndims = H5Sget_simple_extent_ndims(dataspace);
+
+    hid_t dummy_space_id = H5Screate_simple(ndims, &task_local_sizes, NULL); // describes memory data space
+    H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_local_sizes, NULL);
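+    // The two selections above pair a flat memory dataspace (dummy_space_id, task_local_sizes
+    // elements) with the matching 1-D hyperslab of the file dataspace starting at fspace_offset;
+    // the H5Dread below consumes them directly. By argument position, h5_ids[3] is the
+    // dataset-transfer property list; that it is the handle set up by the pariox file helpers
+    // is an assumption of this note.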
+
     clock_gettime(CLOCK_MONOTONIC, &io_start);
-    read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]);
+    H5Dread(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer);
+    //read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]);
     clock_gettime(CLOCK_MONOTONIC, &io_end);
 
     std::size_t file_idx = 0;
@@ -219,7 +227,7 @@ double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std:
     double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
     double diff_io_d = p_io_end - p_io_start;
     double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024);
-    printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate);
+    printf("LOC: %4u, Task: %5u, Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate);
 
     return task_b_vec;
 }
diff --git a/examples/async_write_testbed.cpp b/examples/async_write_testbed.cpp
index 2be768b3f9950122ecdfce4e6eb50bb0456308f5..aff7971d74ae8d95efb4cc5211c1b832f797970e 100644
--- a/examples/async_write_testbed.cpp
+++ b/examples/async_write_testbed.cpp
@@ -103,7 +103,7 @@ double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std
 
     clock_gettime(CLOCK_MONOTONIC, &mat_prep_start);
     generate_2D_mat(&task_A_mat, m_rows, n_cols, seed);
-
+    
     std::size_t task_size = m_rows*n_cols;
     double *write_buffer;
     std::size_t file_idx;
@@ -138,9 +138,17 @@ double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std
     }
     free(task_A_mat);
 
+    hid_t datatype = H5Dget_type(dset_ids[0]);
+    hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace.
+    int ndims = H5Sget_simple_extent_ndims(dataspace);
+
+    hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space
+    H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL);
+
     // write the task local matrix entries.
     clock_gettime(CLOCK_MONOTONIC, &io_start);
-    write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]);
+    //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]);
+    H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer);
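+    // Editorial note: the datatype, dataspace, and dummy_space_id handles opened above are not
+    // released in this change; if desired, they could be closed once the timed write completes,
+    // e.g. H5Sclose(dummy_space_id); H5Sclose(dataspace); H5Tclose(datatype);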
     clock_gettime(CLOCK_MONOTONIC, &io_end);
     free(write_buffer);
 
@@ -158,7 +166,7 @@ double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std
     double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
     double diff_io_d = p_io_end - p_io_start;
     double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024);
-    printf("LOC: %4u, Task: %5u Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \
+    printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \
                      locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate);
 
     return task_b_vec;
@@ -272,7 +280,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_
     h5_ids = prep_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo.h5");
 
     dtype_id = H5T_NATIVE_DOUBLE;
-    
+  
     ndims = 1;
     dset_id = attach_H5_dataset_L(&h5_ids[0], "Mat_entries", dtype_id, ndims, &tot_size_1D);
     x_vec_id = attach_H5_dataset_L(&h5_ids[0], "X_vec", dtype_id, ndims, &total_mat_sizes[1]);
@@ -285,7 +293,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_
     attach_H5_scalar_attr_L<std::size_t>(&b_vec_id, "Total_rows", dtype_id, &total_mat_sizes[1]);
 
 //**************************************Locality ndims data write*********************
-/*
+/* //COMMENTED OUT
 // TODO: writing into 1D file data structure is totally fine though...
     std::size_t fspace_offsets = 0;
     index_offset_calc(ndims, &loc_size_1D, &tot_size_1D, locality_id, &fspace_offsets);
@@ -298,7 +306,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_
     }
 */
 //========================================Write Task-offloading============================================= 
-    //==============Automate offsets calculation for various row size============
+  //==============Automate offsets calculation for various row size============
     // MAKE SURE the division has no remainders for `num_chunks`!
     // Range of rows_per_task = [1, local_mat_sizes[0]]
     num_chunks = local_mat_sizes[0]/rows_per_task;
@@ -348,7 +356,7 @@ std::vector<double> locality_work(std::size_t tot_mat_rows, std::size_t tot_mat_
             mesg = "B_val {1} at implicit_i of {2} on locality {3}.\n";
             hpx::util::format_to(hpx::cout, mesg, loc_B_vec[i], implicit_i, hpx::find_here()) << std::flush;
         }*/
-        free(task_b_vals);
+      free(task_b_vals);
     };
     hpx::wait_each(indiv_write_handling, task_writerow_state);
 
@@ -380,13 +388,14 @@ int main(int argc, char **argv){
     std::size_t tot_mat_rows, tot_mat_cols;
     std::size_t remainder_rows, loc_mat_rows;
     std::size_t rows_per_task;
+
     if (argc < 3){
         printf("Error: Insufficient number of command line argument. \n \
                 Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n");
         exit(0);
     }    
-    sscanf(argv[1], "%u", &tot_mat_rows);
-    sscanf(argv[2], "%u", &rows_per_task);
+    sscanf(argv[1], "%zu", &tot_mat_rows);
+    sscanf(argv[2], "%zu", &rows_per_task);
 
     std::vector<hpx::id_type> localities = hpx::find_all_localities();
     std::uint32_t ntot_localities = hpx::get_num_localities().get();
@@ -409,7 +418,7 @@ int main(int argc, char **argv){
             loc_mat_rows = tot_mat_rows / ntot_localities;
         }
         preceding_rows[i] = preceding_rows[i-1] + loc_mat_rows;
-//        printf("preceding_rows[%u] = %u\n", i, preceding_rows[i]);
+        //printf("preceding_rows[%u] = %zu\n", i, preceding_rows[i]);
     }
 
     std::vector<hpx::future<std::vector<double>>> locality_futures;
diff --git a/examples/sync_read_testbed.cpp b/examples/sync_read_testbed.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4aa477cc20c6ad7718aa020713e9b78988cb14b9
--- /dev/null
+++ b/examples/sync_read_testbed.cpp
@@ -0,0 +1,316 @@
+#include <mpi.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <omp.h>
+
+#include "./pariox_h5.hpp"
+#include <Random123/philox.h>
+#include <Random123/uniform.hpp>
+
+int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){
+    typedef r123::Philox4x32 RNG;
+    RNG rng;
+    RNG::ctr_type c={{}};
+    RNG::ukey_type uk={{}};
+    uk[0] = locality; // some user_supplied_seed
+    RNG::key_type k=uk;
+    RNG::ctr_type r;
+            
+    c[0] = 15112024;
+    c[1] = 16122024;
+
+    std::srand(locality * 15112024);
+
+    *matrix = (double **)calloc(m, sizeof(double*));
+    for (std::size_t i = 0; i < m; i++){
+        (*matrix)[i] = (double *)calloc(n, sizeof(double));
+    } 
+
+    for (std::size_t i = 0; i < m; i++){
+        for (std::size_t j = 0; j < n; j++){
+            c[0] += 1;
+            c[1] += 1;
+            r = rng(c, k);
+            (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX;
+            //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX;
+        }
+    }
+    return 0;
+}
+
+int generate_vec(double **x_vec, std::size_t n, std::size_t locality){
+    std::srand(241119);
+
+    *x_vec = (double *)calloc(n, sizeof(double));
+    for (std::size_t i = 0; i < n; i++){
+        (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX;
+    }
+    return 0;
+}
+
+double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                            std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \
+                            hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                            std::size_t task_instance){
+    //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\
+                                           (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\
+                                           loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]);
+    struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end;
+    double ** task_A_mat;
+    std::size_t seed = locality_id + m_rows*n_cols*task_instance;
+
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_start);
+    generate_2D_mat(&task_A_mat, m_rows, n_cols, seed);
+
+    std::size_t task_size = m_rows*n_cols;
+    double *write_buffer;
+    std::size_t file_idx;
+    write_buffer = (double*)calloc(task_size, sizeof(double));
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            write_buffer[file_idx] = task_A_mat[i][j];
+        }
+    }
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_end);
+
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    // Compute the mat * vec
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows,sizeof(double));
+    
+    for (std::size_t i = 0; i < m_rows; i++){
+        task_b_vec[i] = 0;
+        //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols);
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx]*loc_X_vec[j];
+            // task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j];
+        }
+        //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]);
+    }
+
+    for (std::size_t i = 0; i < m_rows; i++){
+        free(task_A_mat[i]);
+    }
+    free(task_A_mat);
+
+    hid_t datatype = H5Dget_type(dset_ids[0]);
+    hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace.
+    int ndims = H5Sget_simple_extent_ndims(dataspace);
+
+    hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space
+    H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL);
+
+    // write the task local matrix entries.
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]);
+    H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+    free(write_buffer);
+
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
+    double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9;
+    double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9;
+    double diff_mat_d = p_mat_end - p_mat_start;
+
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \
+                     locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
+double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                               std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \
+                               hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                               std::size_t task_instance){
+    struct timespec start, end, io_start, io_end;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows, sizeof(double));
+
+    std::size_t task_local_sizes = m_rows*n_cols;
+    double *write_buffer;
+    write_buffer = (double*)calloc(task_local_sizes, sizeof(double));
+
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+
+    std::size_t file_idx = 0;
+    /*for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            (*loc_A_mat)[row_offset + i][j] = write_buffer[file_idx];
+        }
+    }
+   
+    for (std::size_t i = 0; i < m_rows; i++){
+        //printf("Row_off: %u, %u, write_buff: %f %f %f %f\n", row_offset, fspace_offset, \
+                                             write_buffer[0], write_buffer[1], \
+                                             write_buffer[2], write_buffer[3]);
+        printf("Row_off: %u, %u, Task A mat: %f %f %f %f\n", row_offset, fspace_offset,\
+                                            (*loc_A_mat)[row_offset + i][0], \
+                                            (*loc_A_mat)[row_offset + i][1], \
+                                            (*loc_A_mat)[row_offset + i][2], \
+                                            (*loc_A_mat)[row_offset + i][3]);
+    }
+    printf("x_vec: %f %f ... %f %f\n", (*loc_X_vec)[0], (*loc_X_vec)[1], (*loc_X_vec)[n_cols-2], \
+                                       (*loc_X_vec)[n_cols-1]);*/
+    // Compute!
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx] * (*loc_X_vec)[j];
+        }
+    }
+    free(write_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
+int main(int argc, char **argv){
+    std::size_t tot_mat_rows, tot_mat_cols;
+    std::size_t remainder_rows, loc_mat_rows;
+    std::size_t rows_per_task;
+    if (argc < 3){
+        printf("Error: Insufficient number of command line argument. \n \
+                Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n");
+        exit(0);
+    }    
+    sscanf(argv[1], "%zu", &tot_mat_rows);
+    sscanf(argv[2], "%zu", &rows_per_task);
+    tot_mat_cols = tot_mat_rows;
+   // ================= Generate x vector=====================
+    std::vector<hid_t> dset_ids(3);
+
+    std::size_t m_rows, n_cols;
+    double **loc_A_mat;
+    int rank, nprocs, num_threads;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    num_threads = omp_get_num_threads();
+
+    // =========================== Open a H5 file =========================
+    std::size_t ndims;
+    std::vector<hid_t> h5_ids;
+    hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id;
+
+    h5_ids = open_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5");
+
+    dtype_id = H5T_NATIVE_DOUBLE;
+
+    std::size_t *Mat_1D_lens;
+    std::vector<std::size_t> total_mat_sizes(2);
+    std::vector<std::size_t> local_mat_sizes(2);
+    std::size_t loc_size_1D;
+    dset_id = access_H5_dataset_L(&h5_ids[0], "/Mat_entries", &dtype_id, &ndims, &Mat_1D_lens);
+
+    hid_t attr_dtype_id = H5T_NATIVE_UINT;
+    access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", attr_dtype_id, &total_mat_sizes[1]);
+    access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", attr_dtype_id, &total_mat_sizes[0]);
+
+    local_mat_sizes[0] = total_mat_sizes[0]/nprocs;
+    local_mat_sizes[1] = total_mat_sizes[1];
+    loc_size_1D = local_mat_sizes[0]*local_mat_sizes[1];
+
+    std::size_t *dummy_lens; 
+    std::size_t dummy_ndims;
+    x_vec_id = access_H5_dataset_L(&h5_ids[0], "/X_vec", &dtype_id, &dummy_ndims, &dummy_lens);
+    b_vec_id = access_H5_dataset_L(&h5_ids[0], "/B_vec", &dtype_id, &dummy_ndims, &dummy_lens);
+    
+    m_rows = static_cast<int>(rows_per_task);
+    n_cols = total_mat_sizes[1];
+    double *loc_b_vec, *loc_x_vec;
+    loc_b_vec = (double *)calloc(local_mat_sizes[0], sizeof(double));
+    loc_x_vec = (double *)calloc(local_mat_sizes[1], sizeof(double));
+    free(dummy_lens);
+
+    std::size_t x_fspace_id = 0;
+    read_H5_ndim_data_L<double>(&h5_ids[0], &x_vec_id, &x_fspace_id, &local_mat_sizes[1], &loc_x_vec[0]);
+    x_fspace_id = rank*local_mat_sizes[0];
+    read_H5_ndim_data_L<double>(&h5_ids[0], &b_vec_id, &x_fspace_id, &local_mat_sizes[0], &loc_b_vec[0]);
+    // ================= Pre-compute process & thread offsets==============
+    int num_chunks;
+    num_chunks = local_mat_sizes[0]/rows_per_task;
+    std::vector<std::size_t> fspace_offsets(num_chunks);
+    index_offset_calc(ndims, &loc_size_1D, &Mat_1D_lens[0], rank, &fspace_offsets[0]);
+
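+    // The loop below derives each OpenMP thread's 1-D file-space offset: the rank's base offset
+    // (fspace_offsets[0], computed from the per-rank 1-D extent) plus the chunk index scaled by
+    // the task-sized stride obtained from index_offset_calc.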
+    std::vector<std::size_t> row_offsets(num_chunks);
+    std::size_t task_dim_lens = rows_per_task*total_mat_sizes[1];
+    std::size_t temp;
+    for (std::size_t j = 0; j < num_chunks; j++){
+        index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp);
+        fspace_offsets[j] = fspace_offsets[0] + temp;
+        row_offsets[j] = j*rows_per_task; 
+        // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols);
+    }
+
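+    // Each OpenMP thread reads its own hyperslab of /Mat_entries, recomputes its slice of
+    // b = A*x, and checks it against the B_vec values read above. The thread count matching
+    // num_chunks is an assumption of this offset scheme.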
+    #pragma omp parallel
+    {
+        int thread_num;
+        thread_num = omp_get_thread_num();
+
+        double *thread_loc_B; 
+        //thread_loc_B = task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                               std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \
+                               hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                               std::size_t task_instance){
+        thread_loc_B = task_compute_read_work(&loc_A_mat, 0, 0, \
+                            m_rows, n_cols, &loc_x_vec, &h5_ids[0], \
+                            &dset_id, fspace_offsets[thread_num], rank, thread_num);
+
+        std::size_t initial_offset = thread_num*m_rows;
+        double diff_val;
+        for (std::size_t k = 0; k < m_rows; k++){
+            diff_val = fabs(thread_loc_B[k] - loc_b_vec[initial_offset + k]);
+            if (diff_val > 1e-6) printf("Error! Diff: %f\n", diff_val);
+        }
+        free(thread_loc_B);
+    }
+  
+    double sum_loc_b = 0;
+    for (std::size_t i = 0; i < local_mat_sizes[0]; i++){
+        sum_loc_b += loc_b_vec[i];
+    }
+    printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b);
+    
+   // ================= Write x & b vector per process ====================
+
+    free(loc_x_vec);
+    free(loc_b_vec);
+
+    free(Mat_1D_lens);
+
+    close_H5_dataset_L(&dset_id);
+    close_H5_dataset_L(&x_vec_id);
+    close_H5_dataset_L(&b_vec_id);
+    close_H5_file_L(&h5_ids[0]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/examples/sync_read_testbed2.cpp b/examples/sync_read_testbed2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..409ff464f6482ef69c97e1be17224a63fc127ae0
--- /dev/null
+++ b/examples/sync_read_testbed2.cpp
@@ -0,0 +1,301 @@
+#include <mpi.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <omp.h>
+
+#include "./pariox_h5.hpp"
+#include <Random123/philox.h>
+#include <Random123/uniform.hpp>
+
+int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){
+    typedef r123::Philox4x32 RNG;
+    RNG rng;
+    RNG::ctr_type c={{}};
+    RNG::ukey_type uk={{}};
+    uk[0] = locality; // some user_supplied_seed
+    RNG::key_type k=uk;
+    RNG::ctr_type r;
+            
+    c[0] = 15112024;
+    c[1] = 16122024;
+
+    std::srand(locality * 15112024);
+
+    *matrix = (double **)calloc(m, sizeof(double*));
+    for (std::size_t i = 0; i < m; i++){
+        (*matrix)[i] = (double *)calloc(n, sizeof(double));
+    } 
+
+    for (std::size_t i = 0; i < m; i++){
+        for (std::size_t j = 0; j < n; j++){
+            c[0] += 1;
+            c[1] += 1;
+            r = rng(c, k);
+            (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX;
+            //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX;
+        }
+    }
+    return 0;
+}
+
+int generate_vec(double **x_vec, std::size_t n, std::size_t locality){
+    std::srand(241119);
+
+    *x_vec = (double *)calloc(n, sizeof(double));
+    for (std::size_t i = 0; i < n; i++){
+        (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX;
+    }
+    return 0;
+}
+
+double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                            std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \
+                            hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                            std::size_t task_instance){
+    //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\
+                                           (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\
+                                           loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]);
+    struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end;
+    double ** task_A_mat;
+    std::size_t seed = locality_id + m_rows*n_cols*task_instance;
+
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_start);
+    generate_2D_mat(&task_A_mat, m_rows, n_cols, seed);
+
+    std::size_t task_size = m_rows*n_cols;
+    double *write_buffer;
+    std::size_t file_idx;
+    write_buffer = (double*)calloc(task_size, sizeof(double));
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            write_buffer[file_idx] = task_A_mat[i][j];
+        }
+    }
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_end);
+
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    // Compute the mat * vec
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows,sizeof(double));
+    
+    for (std::size_t i = 0; i < m_rows; i++){
+        task_b_vec[i] = 0;
+        //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols);
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx]*loc_X_vec[j];
+            // task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j];
+        }
+        //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]);
+    }
+
+    for (std::size_t i = 0; i < m_rows; i++){
+        free(task_A_mat[i]);
+    }
+    free(task_A_mat);
+
+    hid_t datatype = H5Dget_type(dset_ids[0]);
+    hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace.
+    int ndims = H5Sget_simple_extent_ndims(dataspace);
+
+    hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space
+    H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL);
+
+    // write the task local matrix entries.
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]);
+    H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+    free(write_buffer);
+
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
+    double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9;
+    double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9;
+    double diff_mat_d = p_mat_end - p_mat_start;
+
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \
+                     locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
+double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                               std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \
+                               hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                               std::size_t task_instance){
+    struct timespec start, end, io_start, io_end;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows, sizeof(double));
+
+    double **mat_buffer;
+    std::size_t dim_lens[2];
+    dim_lens[0] = m_rows;
+    dim_lens[1] = n_cols;
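+    // read_and_unflatten_2D (per its name) is assumed to read the task's 1-D hyperslab at
+    // fspace_offset and return a freshly allocated m_rows x n_cols row-pointer matrix,
+    // which is freed again after the multiply below.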
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    read_and_unflatten_2D(h5_ids, dset_ids, fspace_offset, &dim_lens[0], &mat_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+
+    std::size_t task_local_sizes = m_rows*n_cols;
+    // Compute!
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            task_b_vec[i] = task_b_vec[i] + mat_buffer[i][j] * (*loc_X_vec)[j];
+        }
+    }
+    
+    for(std::size_t i = 0; i < dim_lens[0]; i++){
+        free(mat_buffer[i]);
+    }
+    free(mat_buffer);
+    
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
+int main(int argc, char **argv){
+    std::size_t tot_mat_rows, tot_mat_cols;
+    std::size_t remainder_rows, loc_mat_rows;
+    std::size_t rows_per_task;
+    if (argc < 3){
+        printf("Error: Insufficient number of command line argument. \n \
+                Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n");
+        exit(0);
+    }    
+    sscanf(argv[1], "%zu", &tot_mat_rows);
+    sscanf(argv[2], "%zu", &rows_per_task);
+    tot_mat_cols = tot_mat_rows;
+   // ================= Generate x vector=====================
+    std::vector<hid_t> dset_ids(3);
+
+    std::size_t m_rows, n_cols;
+    double **loc_A_mat;
+    int rank, nprocs, num_threads;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    num_threads = omp_get_num_threads();
+
+    // =========================== Open a H5 file =========================
+    std::size_t ndims;
+    std::vector<hid_t> h5_ids;
+    hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id;
+
+    h5_ids = open_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5");
+
+    dtype_id = H5T_NATIVE_DOUBLE;
+
+    std::size_t *Mat_1D_lens;
+    std::vector<std::size_t> total_mat_sizes(2);
+    std::vector<std::size_t> local_mat_sizes(2);
+    std::size_t loc_size_1D;
+    dset_id = access_H5_dataset_L(&h5_ids[0], "/Mat_entries", &dtype_id, &ndims, &Mat_1D_lens);
+
+    hid_t attr_dtype_id = H5T_NATIVE_UINT;
+    access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", attr_dtype_id, &total_mat_sizes[1]);
+    access_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", attr_dtype_id, &total_mat_sizes[0]);
+
+    local_mat_sizes[0] = total_mat_sizes[0]/nprocs;
+    local_mat_sizes[1] = total_mat_sizes[1];
+    loc_size_1D = local_mat_sizes[0]*local_mat_sizes[1];
+
+    std::size_t *dummy_lens; 
+    std::size_t dummy_ndims;
+    x_vec_id = access_H5_dataset_L(&h5_ids[0], "/X_vec", &dtype_id, &dummy_ndims, &dummy_lens);
+    b_vec_id = access_H5_dataset_L(&h5_ids[0], "/B_vec", &dtype_id, &dummy_ndims, &dummy_lens);
+    
+    m_rows = static_cast<int>(rows_per_task);
+    n_cols = total_mat_sizes[1];
+    double *loc_b_vec, *loc_x_vec;
+    loc_b_vec = (double *)calloc(local_mat_sizes[0], sizeof(double));
+    loc_x_vec = (double *)calloc(local_mat_sizes[1], sizeof(double));
+    free(dummy_lens);
+
+    std::size_t x_fspace_id = 0;
+    read_H5_ndim_data_L<double>(&h5_ids[0], &x_vec_id, &x_fspace_id, &local_mat_sizes[1], &loc_x_vec[0]);
+    x_fspace_id = rank*local_mat_sizes[0];
+    read_H5_ndim_data_L<double>(&h5_ids[0], &b_vec_id, &x_fspace_id, &local_mat_sizes[0], &loc_b_vec[0]);
+    // ================= Pre-compute process & thread offsets==============
+    int num_chunks;
+    num_chunks = local_mat_sizes[0]/rows_per_task;
+    std::vector<std::size_t> fspace_offsets(num_chunks);
+    index_offset_calc(ndims, &loc_size_1D, &Mat_1D_lens[0], rank, &fspace_offsets[0]);
+
+    std::vector<std::size_t> row_offsets(num_chunks);
+    std::size_t task_dim_lens = rows_per_task*total_mat_sizes[1];
+    std::size_t temp;
+    for (std::size_t j = 0; j < num_chunks; j++){
+        index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp);
+        fspace_offsets[j] = fspace_offsets[0] + temp;
+        row_offsets[j] = j*rows_per_task; 
+        // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols);
+    }
+
+    #pragma omp parallel
+    {
+        int thread_num;
+        thread_num = omp_get_thread_num();
+
+        double *thread_loc_B; 
+        //thread_loc_B = task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                               std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \
+                               hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                               std::size_t task_instance){
+        thread_loc_B = task_compute_read_work(&loc_A_mat, 0, 0, \
+                            m_rows, n_cols, &loc_x_vec, &h5_ids[0], \
+                            &dset_id, fspace_offsets[thread_num], rank, thread_num);
+
+        std::size_t initial_offset = thread_num*m_rows;
+        double diff_val;
+        for (std::size_t k = 0; k < m_rows; k++){
+            diff_val = fabs(thread_loc_B[k] - loc_b_vec[initial_offset + k]);
+            if (diff_val > 1e-6) printf("Error! Diff: %f\n", diff_val);
+        }
+        free(thread_loc_B);
+    }
+  
+    double sum_loc_b = 0;
+    for (std::size_t i = 0; i < local_mat_sizes[0]; i++){
+        sum_loc_b += loc_b_vec[i];
+    }
+    printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b);
+    
+   // ================= Write x & b vector per process ====================
+
+    free(loc_x_vec);
+    free(loc_b_vec);
+
+    free(Mat_1D_lens);
+
+    close_H5_dataset_L(&dset_id);
+    close_H5_dataset_L(&x_vec_id);
+    close_H5_dataset_L(&b_vec_id);
+    close_H5_file_L(&h5_ids[0]);
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/examples/sync_write_testbed.cpp b/examples/sync_write_testbed.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a1f838bf6935b063c1f6973a7f3cc884d0f6047
--- /dev/null
+++ b/examples/sync_write_testbed.cpp
@@ -0,0 +1,316 @@
+#include <mpi.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <omp.h>
+
+#include "./pariox_h5.hpp"
+#include <Random123/philox.h>
+#include <Random123/uniform.hpp>
+
+int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){
+    typedef r123::Philox4x32 RNG;
+    RNG rng;
+    RNG::ctr_type c={{}};
+    RNG::ukey_type uk={{}};
+    uk[0] = locality; // some user_supplied_seed
+    RNG::key_type k=uk;
+    RNG::ctr_type r;
+            
+    c[0] = 15112024;
+    c[1] = 16122024;
+
+    std::srand(locality * 15112024);
+
+    *matrix = (double **)calloc(m, sizeof(double*));
+    for (std::size_t i = 0; i < m; i++){
+        (*matrix)[i] = (double *)calloc(n, sizeof(double));
+    } 
+
+    for (std::size_t i = 0; i < m; i++){
+        for (std::size_t j = 0; j < n; j++){
+            c[0] += 1;
+            c[1] += 1;
+            r = rng(c, k);
+            (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX;
+            //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX;
+        }
+    }
+    return 0;
+}
+
+int generate_vec(double **x_vec, std::size_t n, std::size_t locality){
+    std::srand(241119);
+
+    *x_vec = (double *)calloc(n, sizeof(double));
+    for (std::size_t i = 0; i < n; i++){
+        (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX;
+    }
+    return 0;
+}
+
+double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                            std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \
+                            hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                            std::size_t task_instance){
+    //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\
+                                           (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\
+                                           loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]);
+    struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end;
+    double ** task_A_mat;
+    std::size_t seed = locality_id + m_rows*n_cols*task_instance;
+
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_start);
+    generate_2D_mat(&task_A_mat, m_rows, n_cols, seed);
+
+    std::size_t task_size = m_rows*n_cols;
+    std::size_t file_idx;
+    double *write_buffer;
+
+    //======================try out flattening functions===================
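+    // prep_pointers/flatten (per their names) are assumed to build a row-pointer view of
+    // task_A_mat and copy it into a contiguous write_buffer, replacing the manual row-major
+    // copy kept commented out below.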
+    double **ptr_of_pointers;
+    std::size_t dim_lens[2];
+    dim_lens[0] = m_rows;
+    dim_lens[1] = n_cols;
+    prep_pointers(&dim_lens[0], &ptr_of_pointers, task_A_mat);
+    flatten(2, &dim_lens[0], ptr_of_pointers, &write_buffer);
+    free(ptr_of_pointers);
+    //======================================================================
+    //====================Normal way of flattening==========================
+/*    write_buffer = (double*)calloc(task_size, sizeof(double));
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            write_buffer[file_idx] = task_A_mat[i][j];
+        }
+    }*/
+    //=======================================================================
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_end);
+
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    // Compute the mat * vec
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows,sizeof(double));
+    
+    for (std::size_t i = 0; i < m_rows; i++){
+        task_b_vec[i] = 0;
+        //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols);
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx]*loc_X_vec[j];
+            // task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j];
+        }
+        //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]);
+    }
+
+    for (std::size_t i = 0; i < m_rows; i++){
+        free(task_A_mat[i]);
+    }
+    free(task_A_mat);
+
+    hid_t datatype = H5Dget_type(dset_ids[0]);
+    hid_t dataspace = H5Dget_space(dset_ids[0]);// the in-file dataset's dataspace.
+    int ndims = H5Sget_simple_extent_ndims(dataspace);
+
+    hid_t dummy_space_id = H5Screate_simple(ndims, &task_size, NULL); // describes memory data space
+    H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, &fspace_offset, NULL, &task_size, NULL);
+
+    // write the task local matrix entries.
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    //write_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_size, &write_buffer[0]);
+    H5Dwrite(dset_ids[0], datatype, dummy_space_id, dataspace, h5_ids[3], write_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+    free(write_buffer);
+
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
+    double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9;
+    double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9;
+    double diff_mat_d = p_mat_end - p_mat_start;
+
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \
+                     locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
+double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                               std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \
+                               hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                               std::size_t task_instance){
+    struct timespec start, end, io_start, io_end;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows, sizeof(double));
+
+    std::size_t task_local_sizes = m_rows*n_cols;
+    double *write_buffer;
+    write_buffer = (double*)calloc(task_local_sizes, sizeof(double));
+
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+
+    std::size_t file_idx = 0;
+    /*for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            (*loc_A_mat)[row_offset + i][j] = write_buffer[file_idx];
+        }
+    }
+   
+    for (std::size_t i = 0; i < m_rows; i++){
+        //printf("Row_off: %u, %u, write_buff: %f %f %f %f\n", row_offset, fspace_offset, \
+                                             write_buffer[0], write_buffer[1], \
+                                             write_buffer[2], write_buffer[3]);
+        printf("Row_off: %u, %u, Task A mat: %f %f %f %f\n", row_offset, fspace_offset,\
+                                            (*loc_A_mat)[row_offset + i][0], \
+                                            (*loc_A_mat)[row_offset + i][1], \
+                                            (*loc_A_mat)[row_offset + i][2], \
+                                            (*loc_A_mat)[row_offset + i][3]);
+    }
+    printf("x_vec: %f %f ... %f %f\n", (*loc_X_vec)[0], (*loc_X_vec)[1], (*loc_X_vec)[n_cols-2], \
+                                       (*loc_X_vec)[n_cols-1]);*/
+    // Compute!
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx] * (*loc_X_vec)[j];
+        }
+    }
+    free(write_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
+int main(int argc, char **argv){
+    std::size_t tot_mat_rows, tot_mat_cols;
+    std::size_t remainder_rows, loc_mat_rows;
+    std::size_t rows_per_task;
+    if (argc < 3){
+        printf("Error: Insufficient number of command line argument. \n \
+                Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension lenght> <int value for rows per task>\n");
+        exit(0);
+    }    
+    sscanf(argv[1], "%zu", &tot_mat_rows);
+    sscanf(argv[2], "%zu", &rows_per_task);
+    tot_mat_cols = tot_mat_rows;
+   // ================= Generate x vector=====================
+    double *loc_x_vec;
+    generate_vec(&loc_x_vec, tot_mat_rows, 0);
+
+    std::size_t m_rows, n_cols;
+    double **loc_A_mat;
+    int rank, nprocs, num_threads;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    num_threads = omp_get_num_threads();
+    loc_mat_rows = tot_mat_rows/nprocs;
+    m_rows = static_cast<int>(rows_per_task);
+    n_cols = tot_mat_rows;
+
+    double *loc_b_vec;
+    loc_b_vec = (double *)calloc(loc_mat_rows, sizeof(double));
+    // =========================== Open a H5 file =========================
+    int ndims;
+    std::size_t tot_size_1D;
+    tot_size_1D = tot_mat_cols*tot_mat_rows;
+    std::vector<hid_t> h5_ids;
+    hid_t data_id, dtype_id, attr_id, dset_id, x_vec_id, b_vec_id;
+
+    h5_ids = prep_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5");
+
+    dtype_id = H5T_NATIVE_DOUBLE;
+    
+    ndims = 1;
+    dset_id = attach_H5_dataset_L(&h5_ids[0], "Mat_entries", dtype_id, ndims, &tot_size_1D);
+    x_vec_id = attach_H5_dataset_L(&h5_ids[0], "X_vec", dtype_id, ndims, &tot_mat_cols);
+    b_vec_id = attach_H5_dataset_L(&h5_ids[0], "B_vec", dtype_id, ndims, &tot_mat_cols);
+
+    dtype_id = H5T_NATIVE_UINT;
+    attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", dtype_id, &tot_mat_rows);
+    attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", dtype_id, &tot_mat_cols);
+    attach_H5_scalar_attr_L<std::size_t>(&x_vec_id, "Total_rows", dtype_id, &tot_mat_cols);
+    attach_H5_scalar_attr_L<std::size_t>(&b_vec_id, "Total_rows", dtype_id, &tot_mat_cols);
+
+
+    // ================= Pre-compute process & thread offsets==============
+    std::size_t loc_size_1D;
+    int num_chunks;
+    num_chunks = loc_mat_rows/rows_per_task;
+    std::vector<std::size_t> fspace_offsets(num_chunks);
+    loc_size_1D = tot_mat_cols*loc_mat_rows;
+    index_offset_calc(ndims, &loc_size_1D, &tot_size_1D, rank, &fspace_offsets[0]);
+
+    std::vector<std::size_t> row_offsets(num_chunks);
+    std::size_t task_dim_lens = rows_per_task*tot_mat_cols;
+    std::size_t temp;
+    for (std::size_t j = 0; j < num_chunks; j++){
+        index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp);
+        fspace_offsets[j] = fspace_offsets[0] + temp;
+        row_offsets[j] = j*rows_per_task; 
+        // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols);
+    }
+
+    #pragma omp parallel
+    {
+        int thread_num;
+        thread_num = omp_get_thread_num();
+
+        double *thread_loc_B; 
+        thread_loc_B = task_compute_write_work(&loc_A_mat, 0, 0, \
+                            m_rows, n_cols, &loc_x_vec[0], &h5_ids[0], \
+                            &dset_id, fspace_offsets[thread_num], rank, thread_num);
+
+        std::size_t initial_offset = thread_num*m_rows;
+        for (std::size_t k = 0; k < m_rows; k++){
+            loc_b_vec[initial_offset + k] = thread_loc_B[k];
+        }
+        free(thread_loc_B);
+    }
+    
+    double sum_loc_b = 0;
+    for (std::size_t i = 0; i < loc_mat_rows; i++){
+        sum_loc_b += loc_b_vec[i];
+    }
+    printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b);
+
+    ndims = 1; 
+    index_offset_calc(ndims, &loc_mat_rows, &tot_mat_rows, rank, &fspace_offsets[0]);
+    write_H5_ndim_data_L(&h5_ids[0], &b_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_b_vec[0]);
+    write_H5_ndim_data_L(&h5_ids[0], &x_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_x_vec[fspace_offsets[0]]);
+   // ================= Write x & b vector per process ====================
+
+    free(loc_x_vec);
+    free(loc_b_vec);
+
+    close_H5_dataset_L(&dset_id);
+    close_H5_dataset_L(&x_vec_id);
+    close_H5_dataset_L(&b_vec_id);
+    close_H5_file_L(&h5_ids[0]);
+   
+    MPI_Finalize();
+    return 0;
+}
diff --git a/examples/sync_write_testbed2.cpp b/examples/sync_write_testbed2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c27382d8fcb894f9dc573812654a63ddfdb1425
--- /dev/null
+++ b/examples/sync_write_testbed2.cpp
@@ -0,0 +1,292 @@
+#include <mpi.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <omp.h>
+
+#include "./pariox_h5.hpp"
+#include <Random123/philox.h>
+#include <Random123/uniform.hpp>
+
+int generate_2D_mat(double*** matrix, std::size_t m, std::size_t n, std::size_t locality){
+    typedef r123::Philox4x32 RNG;
+    RNG rng;
+    RNG::ctr_type c={{}};
+    RNG::ukey_type uk={{}};
+    uk[0] = locality; // some user_supplied_seed
+    RNG::key_type k=uk;
+    RNG::ctr_type r;
+            
+    c[0] = 15112024;
+    c[1] = 16122024;
+
+    std::srand(locality * 15112024);
+
+    *matrix = (double **)calloc(m, sizeof(double*));
+    for (std::size_t i = 0; i < m; i++){
+        (*matrix)[i] = (double *)calloc(n, sizeof(double));
+    } 
+
+    for (std::size_t i = 0; i < m; i++){
+        for (std::size_t j = 0; j < n; j++){
+            c[0] += 1;
+            c[1] += 1;
+            r = rng(c, k);
+            (*matrix)[i][j] = r123::u01<double>(r.v[0]); //static_cast<double>(std::rand()) / RAND_MAX;
+            //(*matrix)[i][j] = static_cast<double>(std::rand()) / RAND_MAX;
+        }
+    }
+    return 0;
+}
+
+int generate_vec(double **x_vec, std::size_t n, std::size_t locality){
+    std::srand(241119);
+
+    *x_vec = (double *)calloc(n, sizeof(double));
+    for (std::size_t i = 0; i < n; i++){
+        (*x_vec)[i] = static_cast<double>(std::rand()) / RAND_MAX;
+    }
+    return 0;
+}
+
+double* task_compute_write_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                            std::size_t m_rows, std::size_t n_cols, double *loc_X_vec, hid_t *h5_ids, \
+                            hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                            std::size_t task_instance){
+    //printf("ROW: %u, COL: %u, loc_A_mat: %f %f %f x_vec: %f %f %f\n", row_offset, col_offset,\
+                                           (*loc_A_mat)[row_offset][0], (*loc_A_mat)[row_offset][1], (*loc_A_mat)[row_offset][2],\
+                                           loc_X_vec[0], loc_X_vec[1], loc_X_vec[2]);
+    struct timespec start, end, io_start, io_end, mat_prep_start, mat_prep_end;
+    double ** task_A_mat;
+    std::size_t seed = locality_id + m_rows*n_cols*task_instance;
+
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_start);
+    generate_2D_mat(&task_A_mat, m_rows, n_cols, seed);
+
+    std::size_t task_size = m_rows*n_cols;
+
+    //======================try out flattening functions===================
+    std::size_t dim_lens[2];
+    dim_lens[0] = m_rows;
+    dim_lens[1] = n_cols;
+    //======================================================================
+    clock_gettime(CLOCK_MONOTONIC, &mat_prep_end);
+
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    // Compute the mat * vec
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows,sizeof(double));
+    
+    for (std::size_t i = 0; i < m_rows; i++){
+        task_b_vec[i] = 0;
+        //task_b_vec[i] = vec_mult(loc_A_mat[row_offset + i][0], loc_X_vec, n_cols);
+        for (std::size_t j = 0; j < n_cols; j++){
+            task_b_vec[i] = task_b_vec[i] + task_A_mat[i][j]*loc_X_vec[j];
+        }
+        //printf("task_b_vec[%u] = %f\n", i, task_b_vec[i]);
+    }
+
+    //============================Alternative 1D write============================
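+    // flatten_and_write_2D (per its name) is assumed to flatten task_A_mat row-major and issue
+    // the hyperslab write at fspace_offset in a single call, in place of the separate flatten +
+    // H5Dwrite sequence used in sync_write_testbed.cpp.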
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    flatten_and_write_2D(h5_ids, dset_ids, fspace_offset, dim_lens, &task_A_mat);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+
+    //============================================================================
+
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
+    double p_mat_start = mat_prep_start.tv_sec*1.0 + mat_prep_start.tv_nsec*1e-9;
+    double p_mat_end = mat_prep_end.tv_sec*1.0 + mat_prep_end.tv_nsec*1e-9;
+    double diff_mat_d = p_mat_end - p_mat_start;
+
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_size*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u, Start: %f, End: %f, Mat Dur: %f, I/O Dur: %f, SUB Dur: %f, Write_rate: %f MB/s\n", \
+                     locality_id, task_instance, p_io_start, p_io_end, diff_mat_d, diff_io_d, diff_d, bytes_tf_rate);
+
+    for (std::size_t i = 0; i < m_rows; i++){
+        free(task_A_mat[i]);
+    }
+    free(task_A_mat);
+
+    return task_b_vec;
+}
+
+double* task_compute_read_work(double ***loc_A_mat, std::size_t row_offset, std::size_t col_offset, \
+                               std::size_t m_rows, std::size_t n_cols, double **loc_X_vec, hid_t *h5_ids, \
+                               hid_t *dset_ids, std::size_t fspace_offset, std::size_t locality_id, \
+                               std::size_t task_instance){
+    struct timespec start, end, io_start, io_end;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+
+    double *task_b_vec;
+    task_b_vec = (double *)calloc(m_rows, sizeof(double));
+
+    std::size_t task_local_sizes = m_rows*n_cols;
+    double *write_buffer;
+    write_buffer = (double*)calloc(task_local_sizes, sizeof(double));
+
+    clock_gettime(CLOCK_MONOTONIC, &io_start);
+    read_H5_ndim_data_L(h5_ids, &dset_ids[0], &fspace_offset, &task_local_sizes, &write_buffer[0]);
+    clock_gettime(CLOCK_MONOTONIC, &io_end);
+
+    std::size_t file_idx = 0;
+    /*for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            (*loc_A_mat)[row_offset + i][j] = write_buffer[file_idx];
+        }
+    }
+   
+    for (std::size_t i = 0; i < m_rows; i++){
+        //printf("Row_off: %u, %u, write_buff: %f %f %f %f\n", row_offset, fspace_offset, \
+                                             write_buffer[0], write_buffer[1], \
+                                             write_buffer[2], write_buffer[3]);
+        printf("Row_off: %u, %u, Task A mat: %f %f %f %f\n", row_offset, fspace_offset,\
+                                            (*loc_A_mat)[row_offset + i][0], \
+                                            (*loc_A_mat)[row_offset + i][1], \
+                                            (*loc_A_mat)[row_offset + i][2], \
+                                            (*loc_A_mat)[row_offset + i][3]);
+    }
+    printf("x_vec: %f %f ... %f %f\n", (*loc_X_vec)[0], (*loc_X_vec)[1], (*loc_X_vec)[n_cols-2], \
+                                       (*loc_X_vec)[n_cols-1]);*/
+    // Compute!
+    for (std::size_t i = 0; i < m_rows; i++){
+        for (std::size_t j = 0; j < n_cols; j++){
+            file_idx = i*n_cols + j;
+            task_b_vec[i] = task_b_vec[i] + write_buffer[file_idx] * (*loc_X_vec)[j];
+        }
+    }
+    free(write_buffer);
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    double p_start = start.tv_sec*1.0 + start.tv_nsec*1e-9;
+    double p_end = end.tv_sec*1.0 + end.tv_nsec*1e-9;
+    double diff_d = p_end - p_start;
+
+    double p_io_start = (io_start.tv_sec)*1.0 + io_start.tv_nsec*1e-9;
+    double p_io_end = (io_end.tv_sec)*1.0 + io_end.tv_nsec*1e-9;
+    double diff_io_d = p_io_end - p_io_start;
+    double bytes_tf_rate = (task_local_sizes*8)/(diff_io_d*1024*1024);
+    printf("LOC: %4u, Task: %5u Start: %f, End: %f, I/O Dur: %f, SUB Dur: %f, Read_rate: %f MB/s\n", locality_id, task_instance, p_io_start, p_io_end, diff_io_d, diff_d, bytes_tf_rate);
+
+    return task_b_vec;
+}
+
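+// Driver: each MPI rank owns tot_mat_rows/nprocs rows; within a rank, one OpenMP
+// thread per chunk of rows_per_task rows generates and writes its block and returns
+// a partial b vector that is gathered into loc_b_vec.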
+int main(int argc, char **argv){
+    std::size_t tot_mat_rows, tot_mat_cols;
+    std::size_t remainder_rows, loc_mat_rows;
+    std::size_t rows_per_task;
+    if (argc < 3){
+        printf("Error: Insufficient number of command line arguments.\n \
+                Usage: mpiexec -n 2 <executable> <int value for matrix's first dimension length> <int value for rows per task>\n");
+        exit(1);
+    }
+    sscanf(argv[1], "%zu", &tot_mat_rows);
+    sscanf(argv[2], "%zu", &rows_per_task);
+    tot_mat_cols = tot_mat_rows;
+   // ================= Generate x vector=====================
+    double *loc_x_vec;
+    generate_vec(&loc_x_vec, tot_mat_rows, 0);
+
+    std::size_t m_rows, n_cols;
+    double **loc_A_mat;
+    int rank, nprocs, num_threads;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    num_threads = omp_get_max_threads(); // omp_get_num_threads() would return 1 outside a parallel region
+    loc_mat_rows = tot_mat_rows/nprocs;
+    m_rows = rows_per_task;
+    n_cols = tot_mat_rows;
+
+    double *loc_b_vec;
+    loc_b_vec = (double *)calloc(loc_mat_rows, sizeof(double));
+    // =========================== Open a H5 file =========================
+    int ndims;
+    std::size_t tot_size_1D;
+    tot_size_1D = tot_mat_cols*tot_mat_rows;
+    std::vector<hid_t> h5_ids;
+    hid_t dtype_id, dset_id, x_vec_id, b_vec_id;
+
+    h5_ids = prep_H5_file_L(MPI_COMM_WORLD, "./mat_mult_demo_ompi.h5");
+
+    dtype_id = H5T_NATIVE_DOUBLE;
+    
+    ndims = 1;
+    dset_id = attach_H5_dataset_L(&h5_ids[0], "Mat_entries", dtype_id, ndims, &tot_size_1D);
+    x_vec_id = attach_H5_dataset_L(&h5_ids[0], "X_vec", dtype_id, ndims, &tot_mat_cols);
+    b_vec_id = attach_H5_dataset_L(&h5_ids[0], "B_vec", dtype_id, ndims, &tot_mat_cols);
+
+    dtype_id = H5T_NATIVE_UINT;
+    attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_rows", dtype_id, &tot_mat_rows);
+    attach_H5_scalar_attr_L<std::size_t>(&dset_id, "Total_cols", dtype_id, &tot_mat_cols);
+    attach_H5_scalar_attr_L<std::size_t>(&x_vec_id, "Total_rows", dtype_id, &tot_mat_cols);
+    attach_H5_scalar_attr_L<std::size_t>(&b_vec_id, "Total_rows", dtype_id, &tot_mat_cols);
+
+
+    // ================= Pre-compute process & thread offsets==============
+    std::size_t loc_size_1D;
+    std::size_t num_chunks = loc_mat_rows/rows_per_task;
+    std::vector<std::size_t> fspace_offsets(num_chunks);
+    loc_size_1D = tot_mat_cols*loc_mat_rows;
+    index_offset_calc(ndims, &loc_size_1D, &tot_size_1D, rank, &fspace_offsets[0]);
+
+    std::vector<std::size_t> row_offsets(num_chunks);
+    std::size_t task_dim_lens = rows_per_task*tot_mat_cols;
+    std::size_t temp;
+    for (std::size_t j = 0; j < num_chunks; j++){
+        index_offset_calc(ndims, &task_dim_lens, &loc_size_1D, j, &temp);
+        fspace_offsets[j] = fspace_offsets[0] + temp;
+        row_offsets[j] = j*rows_per_task; 
+        // printf("Rank: %d, thread: %d, offset: %zu %zu %zu\n", rank, j, fspace_offsets[j], m_rows, n_cols);
+    }
+
+    // One chunk of rows per OpenMP thread: request exactly num_chunks threads so
+    // fspace_offsets[thread_num] and the loc_b_vec offsets stay in range.
+    #pragma omp parallel num_threads(num_chunks)
+    {
+        int thread_num;
+        thread_num = omp_get_thread_num();
+
+        double *thread_loc_B; 
+        thread_loc_B = task_compute_write_work(&loc_A_mat, 0, 0, \
+                            m_rows, n_cols, &loc_x_vec[0], &h5_ids[0], \
+                            &dset_id, fspace_offsets[thread_num], rank, thread_num);
+
+        std::size_t initial_offset = thread_num*m_rows;
+        for (std::size_t k = 0; k < m_rows; k++){
+            loc_b_vec[initial_offset + k] = thread_loc_B[k];
+        }
+        free(thread_loc_B);
+    }
+    
+    double sum_loc_b = 0;
+    for (std::size_t i = 0; i < loc_mat_rows; i++){
+        sum_loc_b += loc_b_vec[i];
+    }
+    printf("rank %d, sum_loc_b: %f\n", rank, sum_loc_b);
+
+    // ================= Write x & b vector per process ====================
+    ndims = 1;
+    index_offset_calc(ndims, &loc_mat_rows, &tot_mat_rows, rank, &fspace_offsets[0]);
+    write_H5_ndim_data_L(&h5_ids[0], &b_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_b_vec[0]);
+    write_H5_ndim_data_L(&h5_ids[0], &x_vec_id, &fspace_offsets[0], &loc_mat_rows, &loc_x_vec[fspace_offsets[0]]);
+
+    free(loc_x_vec);
+    free(loc_b_vec);
+
+    close_H5_dataset_L(&dset_id);
+    close_H5_dataset_L(&x_vec_id);
+    close_H5_dataset_L(&b_vec_id);
+    close_H5_file_L(&h5_ids[0]);
+   
+    MPI_Finalize();
+    return 0;
+}
diff --git a/src/pariox_h5.cpp b/src/pariox_h5.cpp
index 15da7d3a9618e2b03c74bc24653acf3a02298bf9..8e54facc0501b281dda04c3b800420bf8ad3b9f5 100644
--- a/src/pariox_h5.cpp
+++ b/src/pariox_h5.cpp
@@ -1,6 +1,6 @@
 #include "./pariox_h5.hpp"
 
-int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *total_lens, \
+int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *dim_lens, \
                       std::size_t curr_index, std::size_t *outcome){
     std::size_t remainder = 1;
     std::size_t *base;
@@ -10,7 +10,7 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size
 
         std::vector<std::size_t> unit_counts(ndims);
         for (std::size_t i = 0; i < ndims; i++){
-            unit_counts[i] = total_lens[i]/unit_chunk_lens[i];
+            unit_counts[i] = dim_lens[i]/unit_chunk_lens[i];
         }
 
         for (std::size_t i = ndims-1; i > 0; i--){
@@ -19,7 +19,7 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size
         }
 
         std::size_t i = 0;
-        while ((i < ndims-1) && (remainder != 0)){
+        while (i < ndims-1){
             outcome[i] = floor(curr_index/base[i]);
             remainder = curr_index%base[i];
             curr_index = remainder;
@@ -34,8 +34,9 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size
         for (std::size_t i = 0; i < ndims; i++){
             outcome[i] *= unit_chunk_lens[i];
         }
+        free(base);
     } else if (ndims == 1){
-        std::size_t unit_count = *total_lens/ *unit_chunk_lens;
+        std::size_t unit_count = *dim_lens/ *unit_chunk_lens;
         *outcome = (curr_index%unit_count) * (*unit_chunk_lens);
     }
     return 0;
@@ -43,11 +44,19 @@ int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size
 
 std::vector<hid_t> prep_H5_file_L(MPI_Comm input_comm, const char* file_name){
     std::vector<hid_t> h5_ids;
+    hbool_t file_locking, dummy_bool;
     h5_ids.resize(4);
+    hid_t dummy_file_handle;
 
     h5_ids[0] = H5Pcreate(H5P_FILE_ACCESS);
     H5Pset_fapl_mpio(h5_ids[0], input_comm, MPI_INFO_NULL);
 
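+    // Disable HDF5 file locking on this FAPL if it is enabled; locking can fail on
+    // file systems without POSIX lock support.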
+    H5Pget_file_locking(h5_ids[0], &file_locking, &dummy_bool);
+    if (file_locking != 0){
+        printf("File locking was TRUE.\n");
+        H5Pset_file_locking(h5_ids[0], 0, 1);
+    }
+
     h5_ids[1] = H5Pcreate(H5P_FILE_CREATE);
     h5_ids[2] = H5Fcreate(file_name, H5F_ACC_TRUNC, h5_ids[1], h5_ids[0]);
 
diff --git a/src/pariox_h5.hpp b/src/pariox_h5.hpp
index 02095d5244780359bb3682bab3f1bf338a0f1d20..4693448a58218699caf89ecd0cf058e50a6a9834 100644
--- a/src/pariox_h5.hpp
+++ b/src/pariox_h5.hpp
@@ -13,9 +13,230 @@
 #include <cmath>
 #include <vector>
 
-int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *total_lens, \
+int index_offset_calc(std::size_t ndims, std::size_t *unit_chunk_lens, std::size_t *dim_lens, \
                       std::size_t curr_index, std::size_t *outcome);
 
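+/* prep_pointers overloads (2D through 6D): fill a calloc'd 1D array with pointers
+   to the first element of each innermost row of a nested array, so flatten() and
+   unflatten() can walk the data through a single pointer table. */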
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T **mat){
+    constexpr std::size_t ndim = 2;
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t dim_iter[ndim];
+    std::size_t unit_chunk_lens[ndim];
+
+    for (std::size_t i = 0; i < ndim-1; i++){
+        ptr_size *= dim_lens[i];
+        unit_chunk_lens[i] = 1;
+    }
+    unit_chunk_lens[ndim-1] = 1;
+    (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+    total_size = ptr_size * dim_lens[ndim-1];
+    std::size_t ptr_i = 0;
+    for (std::size_t idx = 0; idx < total_size; idx++){
+        index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+        if (idx%dim_lens[ndim-1] == 0){
+            (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][0];
+            ptr_i++;
+        }
+    }
+    return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T ***mat){
+    constexpr std::size_t ndim = 3;
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t dim_iter[ndim];
+    std::size_t unit_chunk_lens[ndim];
+
+    for (std::size_t i = 0; i < ndim-1; i++){
+        ptr_size *= dim_lens[i];
+        unit_chunk_lens[i] = 1;
+    }
+    unit_chunk_lens[ndim-1] = 1;
+    (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+    total_size = ptr_size * dim_lens[ndim-1];
+    std::size_t ptr_i = 0;
+    for (std::size_t idx = 0; idx < total_size; idx++){
+        index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+        if (idx%dim_lens[ndim-1] == 0){
+            (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][0];
+            ptr_i++;
+        }
+    }
+    return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T ****mat){
+    constexpr std::size_t ndim = 4;
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t dim_iter[ndim];
+    std::size_t unit_chunk_lens[ndim];
+
+    for (std::size_t i = 0; i < ndim-1; i++){
+        ptr_size *= dim_lens[i];
+        unit_chunk_lens[i] = 1;
+    }
+    unit_chunk_lens[ndim-1] = 1;
+    (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+    total_size = ptr_size * dim_lens[ndim-1];
+    std::size_t ptr_i = 0;
+    for (std::size_t idx = 0; idx < total_size; idx++){
+        index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+        if (idx%dim_lens[ndim-1] == 0){
+            (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][dim_iter[2]][0];
+            ptr_i++;
+        }
+    }
+    return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T *****mat){
+    constexpr std::size_t ndim = 5;
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t dim_iter[ndim];
+    std::size_t unit_chunk_lens[ndim];
+
+    for (std::size_t i = 0; i < ndim-1; i++){
+        ptr_size *= dim_lens[i];
+        unit_chunk_lens[i] = 1;
+    }
+    unit_chunk_lens[ndim-1] = 1;
+    (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+    total_size = ptr_size * dim_lens[ndim-1];
+    std::size_t ptr_i = 0;
+    for (std::size_t idx = 0; idx < total_size; idx++){
+        index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+        if (idx%dim_lens[ndim-1] == 0){
+            (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][dim_iter[2]][dim_iter[3]][0];
+            ptr_i++;
+        }
+    }
+    return 0;
+};
+
+template <typename T>
+int prep_pointers(std::size_t *dim_lens, T ***ptr_of_pointers, T ******mat){
+    constexpr std::size_t ndim = 6;
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t dim_iter[ndim];
+    std::size_t unit_chunk_lens[ndim];
+
+    for (std::size_t i = 0; i < ndim-1; i++){
+        ptr_size *= dim_lens[i];
+        unit_chunk_lens[i] = 1;
+    }
+    unit_chunk_lens[ndim-1] = 1;
+    (*ptr_of_pointers) = (T **)calloc(ptr_size,sizeof(T *));
+
+    total_size = ptr_size * dim_lens[ndim-1];
+    std::size_t ptr_i = 0;
+    for (std::size_t idx = 0; idx < total_size; idx++){
+        index_offset_calc(ndim, unit_chunk_lens, dim_lens, idx, &dim_iter[0]);
+        if (idx%dim_lens[ndim-1] == 0){
+            (*ptr_of_pointers)[ptr_i] = &mat[dim_iter[0]][dim_iter[1]][dim_iter[2]]\
+                                            [dim_iter[3]][dim_iter[4]][0];
+            ptr_i++;
+        }
+    }
+    return 0;
+};
+
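+/* flatten_old: older recursive flattening routine; the index-arithmetic flatten()
+   below covers the same job without recursion. */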
+template <typename T>
+int flatten_old(std::size_t cur_dim, std::size_t total_ndim, std::size_t *sizes, std::size_t *indices, \
+            T **ptr_of_pointers, T **buffer_1D){
+    if (cur_dim < total_ndim - 1){
+        if (cur_dim == 0){
+            std::size_t total_size = 1;
+            for (std::size_t i = 0; i < total_ndim; i++){
+                total_size *= sizes[i];
+            }
+            *buffer_1D = (T *)calloc(total_size, sizeof(T));
+        }
+
+        for(std::size_t i = 0; i < sizes[cur_dim]; i++){
+            indices[cur_dim] = i;
+            flatten_old(cur_dim + 1, total_ndim, sizes, indices, ptr_of_pointers, buffer_1D); // recurse into the next dimension
+        }
+    }else{
+        std::size_t idx = 0;
+        std::size_t buff_idx = 0;
+        std::size_t temp_idx = 0;
+        std::size_t *cumu_sizes;
+        T *ptr;
+        cumu_sizes = (std::size_t *) calloc(total_ndim, sizeof(std::size_t));
+        for (std::size_t i = 0; i < total_ndim; i++){
+            cumu_sizes[i] = 1;
+            for (std::size_t j = i+1; j < total_ndim - 1; j++){
+                cumu_sizes[i] *= sizes[j];
+            }
+        }
+
+        for (std::size_t i = 0; i < total_ndim - 1; i++){
+            idx += indices[i]*cumu_sizes[i];
+        }
+        ptr = ptr_of_pointers[idx];
+        temp_idx = idx*sizes[total_ndim-1];
+        for (std::size_t i = 0; i < sizes[total_ndim-1]; i++){
+            buff_idx = temp_idx + i;
+            (*buffer_1D)[buff_idx] = ptr[i];
+        }
+        free(cumu_sizes);
+    }
+    return 0;
+};
+
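+/* flatten: copy an N-D array, addressed through its row-pointer table, into a
+   newly allocated contiguous 1D buffer in row-major order. */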
+template <typename T>
+int flatten(std::size_t total_ndim, std::size_t *dim_lens, \
+                 T **ptr_of_pointers, T **buffer_1D){
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t j, idx;
+    for (std::size_t i = 0; i < total_ndim - 1; i++){
+        ptr_size *= dim_lens[i];
+    }
+    total_size = ptr_size * dim_lens[total_ndim - 1];
+    *buffer_1D = (T *)calloc(total_size, sizeof(T));
+
+    for (std::size_t i = 0; i < total_size; i++){
+        j = i%dim_lens[total_ndim - 1];
+        idx = i/dim_lens[total_ndim - 1];
+        (*buffer_1D)[i] = ptr_of_pointers[idx][j];
+    }
+
+    return 0;
+};
+
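+/* unflatten: inverse of flatten(); scatter a contiguous 1D buffer back into the
+   N-D array addressed by the row-pointer table. The 1D buffer is not freed here. */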
+template <typename T>
+int unflatten(std::size_t total_ndim, std::size_t *dim_lens, \
+              T **ptr_of_pointers, T **buffer_1D){
+    std::size_t ptr_size = 1;
+    std::size_t total_size = 1;
+    std::size_t j, idx;
+    for (std::size_t i = 0; i < total_ndim - 1; i++){
+        ptr_size *= dim_lens[i];
+    }
+    total_size = ptr_size * dim_lens[total_ndim - 1];
+
+    for (std::size_t i = 0; i < total_size; i++){
+        j = i%dim_lens[total_ndim - 1];
+        idx = i/dim_lens[total_ndim - 1];
+        ptr_of_pointers[idx][j] = (*buffer_1D)[i];
+    }
+
+    return 0;
+};
+
 /* Description of the std::vector<hid_t> output, h5_ids.
   h5_ids[0] = fapl_id
   h5_ids[1] = fcpl_id
@@ -73,7 +294,7 @@ int write_H5_ndim_data_T(hid_t *h5_ids, hid_t *data_id, std::size_t *fspace_idx,
     hid_t datatype = H5Dget_type(*data_id);
     hid_t dataspace = H5Dget_space(*data_id);// the in-file dataset's dataspace.
     int ndims = H5Sget_simple_extent_ndims(dataspace);
-    std::vector<std::size_t> dim_lens(ndims);
+    std::size_t dim_lens[ndims];
     //printf("fspace_idxs: %u %u task_local_sizes: %u %u matrix_entry: %f\n", fspace_idx[0], fspace_idx[1], task_local_sizes[0], task_local_sizes[1], data_write[0]);
 
     std::size_t *offsets;
@@ -159,4 +380,53 @@ int read_H5_ndim_data_T(hid_t* h5_ids, hid_t *data_id, std::size_t *fspace_idx,
 
     return 0;
 };
+
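+/* flatten_and_write_2D: build the row-pointer table for a 2D buffer, flatten it to
+   1D, and write it to the dataset at the given file-space offset. */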
+template <typename T>
+int flatten_and_write_2D(hid_t *h5_ids, hid_t *data_id, std::size_t fspace_offset, \
+                         std::size_t *dim_lens, T ***write_buffer){
+    std::size_t ndims = 2;
+    std::size_t size_1D = 1;
+    T **ptr_of_pointers;
+    T *buffer_1D;
+    prep_pointers(dim_lens, &ptr_of_pointers, (*write_buffer));
+    flatten(ndims, dim_lens, ptr_of_pointers, &buffer_1D);
+    free(ptr_of_pointers);
+   
+    for (std::size_t i = 0; i < ndims; i++){
+        size_1D *= dim_lens[i];
+    }
+     
+    write_H5_ndim_data_L(h5_ids, data_id, &fspace_offset, &size_1D, &buffer_1D[0]);
+    free(buffer_1D);    
+
+    return 0;
+};
+
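+/* read_and_unflatten_2D: allocate a dim_lens[0] x dim_lens[1] output buffer, read
+   the dataset at the given file-space offset into a 1D scratch buffer, and
+   unflatten it into *read_buffer. */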
+template <typename T>
+int read_and_unflatten_2D(hid_t *h5_ids, hid_t *data_id, std::size_t fspace_offset, \
+                        std::size_t *dim_lens, T ***read_buffer){
+    std::size_t ndims = 2;
+    std::size_t size_1D = 1;
+    T **ptr_of_pointers;
+    T *buffer_1D;
+
+    for (std::size_t i = 0; i < ndims; i++){
+        size_1D *= dim_lens[i];
+    }
+    buffer_1D = (T *)calloc(size_1D, sizeof(T));
+
+    (*read_buffer) = (T **)calloc(dim_lens[0], sizeof(T *));
+    for (std::size_t i = 0; i < dim_lens[0]; i++){
+        (*read_buffer)[i] = (T *)calloc(dim_lens[1], sizeof(T));
+    }
+    
+    prep_pointers(dim_lens, &ptr_of_pointers, (*read_buffer));
+    read_H5_ndim_data_L(h5_ids, data_id, &fspace_offset, &size_1D, buffer_1D);
+    unflatten(ndims, dim_lens, ptr_of_pointers, &buffer_1D);
+    free(ptr_of_pointers);
+    free(buffer_1D);
+
+    return 0;
+}
+
 #endif