Skip to content
Snippets Groups Projects
Commit 822e247e authored by Andreas Galonska's avatar Andreas Galonska
Browse files

debugging the restart process

parent 6b27a7b8
No related branches found
No related tags found
No related merge requests found
...@@ -48,7 +48,7 @@ ssize_t nam_get_max_bytes(nam_ext_allocation_t *ext_alloc); ...@@ -48,7 +48,7 @@ ssize_t nam_get_max_bytes(nam_ext_allocation_t *ext_alloc);
* @param recv_bytes size of the data to be gathered * @param recv_bytes size of the data to be gathered
* @param root the root process which gathers the data * @param root the root process which gathers the data
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @return * @return 0 if gather was successful, 1 if not
*/ */
int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, size_t recv_bytes, int root, nam_ext_allocation_t *ext_alloc); int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, size_t recv_bytes, int root, nam_ext_allocation_t *ext_alloc);
...@@ -62,7 +62,7 @@ int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, s ...@@ -62,7 +62,7 @@ int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, s
* @param recv_bytes size of the data to be scattered * @param recv_bytes size of the data to be scattered
* @param root the root process which scatters the data * @param root the root process which scatters the data
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @return * @return 0 if scatter was successful, 1 if not
*/ */
int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, size_t recv_bytes, int root, nam_ext_allocation_t *ext_alloc); int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, size_t recv_bytes, int root, nam_ext_allocation_t *ext_alloc);
...@@ -74,7 +74,7 @@ int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, ...@@ -74,7 +74,7 @@ int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf,
* @param bytes size of the data to be broadcast * @param bytes size of the data to be broadcast
* @param root the root process which broadcasts the data * @param root the root process which broadcasts the data
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @return * @return 0 if bcast was successful, 1 if not
*/ */
int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocation_t *ext_alloc); int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocation_t *ext_alloc);
...@@ -87,7 +87,7 @@ int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocati ...@@ -87,7 +87,7 @@ int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocati
* @param comm MPI Communicator to be used * @param comm MPI Communicator to be used
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @param req request to wait on in this asynchronous operation * @param req request to wait on in this asynchronous operation
* @return * @return 0 if checkpoint was successful, 1 if not
*/ */
int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ext_allocation_t *ext_alloc, nam_async_request_t *req); int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ext_allocation_t *ext_alloc, nam_async_request_t *req);
...@@ -97,7 +97,7 @@ int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ex ...@@ -97,7 +97,7 @@ int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ex
* @param buf the data to participate in a XOR checkpoint. Must not be altered during the operation! * @param buf the data to participate in a XOR checkpoint. Must not be altered during the operation!
* @param bytes size of the local data * @param bytes size of the local data
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @return * @return 0 if checkpoint was successful, 1 if not
*/ */
int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc); int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc);
...@@ -108,7 +108,7 @@ int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc ...@@ -108,7 +108,7 @@ int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc
* @param bytes size of the local data * @param bytes size of the local data
* @param data_avail flag indicating whether the data should be rebuilt by the NAM * @param data_avail flag indicating whether the data should be rebuilt by the NAM
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @return * @return 0 if restart was successful, 1 if not
*/ */
int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation_t *ext_alloc); int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation_t *ext_alloc);
...@@ -123,7 +123,7 @@ int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation ...@@ -123,7 +123,7 @@ int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation
* @param comm MPI Communicator to be used * @param comm MPI Communicator to be used
* @param ext_alloc allocation on the NAM * @param ext_alloc allocation on the NAM
* @param req request to wait on in this asynchronous operation * @param req request to wait on in this asynchronous operation
* @return * @return 0 if restart was successful, 1 if not
*/ */
int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ext_allocation_t *ext_alloc, nam_async_request_t *req); int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ext_allocation_t *ext_alloc, nam_async_request_t *req);
...@@ -139,6 +139,7 @@ int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ex ...@@ -139,6 +139,7 @@ int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ex
* @param op to carry out on data vectors * @param op to carry out on data vectors
* @param return_mode NAM_RETURN/NAM_STORE or any combination * @param return_mode NAM_RETURN/NAM_STORE or any combination
* @param ext_alloc allocation to get data from * @param ext_alloc allocation to get data from
* @return 0 if vector op was successful, 1 if not
*/ */
int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offset, size_t n, void *result, nam_datatype_t type, nam_operation_t op, int return_mode, nam_ext_allocation_t *ext_alloc); int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offset, size_t n, void *result, nam_datatype_t type, nam_operation_t op, int return_mode, nam_ext_allocation_t *ext_alloc);
...@@ -153,6 +154,7 @@ int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offse ...@@ -153,6 +154,7 @@ int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offse
* @param pattern to search for if OP is NAM_SEARCH_BASIC/NAM_SEARCH_ADVANCED * @param pattern to search for if OP is NAM_SEARCH_BASIC/NAM_SEARCH_ADVANCED
* @param return_mode NAM_RETURN/NAM_STORE or any combination * @param return_mode NAM_RETURN/NAM_STORE or any combination
* @param ext_alloc allocation to get data from * @param ext_alloc allocation to get data from
* @return 0 if reduce op was successful, 1 if not
*/ */
int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, nam_datatype_t type, nam_operation_t op, void *pattern, int return_mode, nam_ext_allocation_t *ext_alloc); int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, nam_datatype_t type, nam_operation_t op, void *pattern, int return_mode, nam_ext_allocation_t *ext_alloc);
...@@ -162,6 +164,7 @@ int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, n ...@@ -162,6 +164,7 @@ int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, n
* @param size of requested memory * @param size of requested memory
* @param root rank in comm * @param root rank in comm
* @param comm MPI_Communicator to distribute the allocation * @param comm MPI_Communicator to distribute the allocation
* @return Extended Allocation or NULL
*/ */
nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm); nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm);
...@@ -171,7 +174,7 @@ nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm); ...@@ -171,7 +174,7 @@ nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm);
* @param size size of requested memory * @param size size of requested memory
* @param root process which requests the allocation * @param root process which requests the allocation
* @param comm MPI Communicator where the allocation should be distributed in * @param comm MPI Communicator where the allocation should be distributed in
* @return * @return Extended Allocation or NULL
*/ */
nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm); nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm);
...@@ -181,7 +184,7 @@ nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm); ...@@ -181,7 +184,7 @@ nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm);
* @param size size of requested memory * @param size size of requested memory
* @param root process which requests the allocation * @param root process which requests the allocation
* @param comm MPI Communicator where the allocation should be distributed in * @param comm MPI Communicator where the allocation should be distributed in
* @return * @return Extended Allocation or NULL
*/ */
nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm comm); nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm comm);
...@@ -191,7 +194,7 @@ nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm ...@@ -191,7 +194,7 @@ nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm
* @param size size of one element * @param size size of one element
* @param root process which requests the allocation * @param root process which requests the allocation
* @param comm MPI Communicator where the allocation should be distributed in * @param comm MPI Communicator where the allocation should be distributed in
* @return * @return Extended Allocation or NULL
*/ */
nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Comm comm); nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Comm comm);
...@@ -200,7 +203,7 @@ nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Co ...@@ -200,7 +203,7 @@ nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Co
* @param alloc Allocation to be freed * @param alloc Allocation to be freed
* @param root process which should request the free on the NAM Manager * @param root process which should request the free on the NAM Manager
* @param comm MPI Communicator on which the allocation should be freed * @param comm MPI Communicator on which the allocation should be freed
* @return * @return 0 if free was successful, 1 if not
*/ */
int nam_free_all(nam_ext_allocation_t *ext_alloc); int nam_free_all(nam_ext_allocation_t *ext_alloc);
......
...@@ -641,6 +641,14 @@ int nam_restart(nam_cp_args_t *args) ...@@ -641,6 +641,14 @@ int nam_restart(nam_cp_args_t *args)
nam_reset = 0; //must be set in order to restart nam_reset = 0; //must be set in order to restart
extoll_reset_checkpointing(config_con); extoll_reset_checkpointing(config_con);
//1. Write the number of Ranks to CR-C0
if(extoll_configure_num_ranks(size, config_con))
{
nam_print(0, "Error configuring #Ranks!");
rc = 1;
goto error;
}
//configure all other processes //configure all other processes
for (i = 0; i < size; ++i) for (i = 0; i < size; ++i)
{ {
...@@ -685,14 +693,16 @@ int nam_restart(nam_cp_args_t *args) ...@@ -685,14 +693,16 @@ int nam_restart(nam_cp_args_t *args)
if(!rc) if(!rc)
{ {
extoll_check_notifications_block(config_con);
//when the new rank gets its noti, reconstruction is finished! //when the new rank gets its noti, reconstruction is finished!
if(!my_data_avail) if(!my_data_avail)
{ {
extoll_check_notifications_block(config_con);
nam_get_sync(buf, 0, my_bytes, alloc); nam_get_sync(buf, 0, my_bytes, alloc);
} }
} }
sleep(3);
MPI_Barrier(comm); MPI_Barrier(comm);
//set all memory to inactive //set all memory to inactive
extoll_reset_memory(); extoll_reset_memory();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment