diff --git a/include/nam_ext_interface.h b/include/nam_ext_interface.h index 2e9a276bff11591d8db78cfd90daa570b5f05a22..aada3a5a5bbd31c0cb706259c595e30db136ca52 100644 --- a/include/nam_ext_interface.h +++ b/include/nam_ext_interface.h @@ -48,7 +48,7 @@ ssize_t nam_get_max_bytes(nam_ext_allocation_t *ext_alloc); * @param recv_bytes size of the data to be gathered * @param root the root process which gathers the data * @param ext_alloc allocation on the NAM - * @return + * @return 0 if gather was successful, 1 if not */ int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, size_t recv_bytes, int root, nam_ext_allocation_t *ext_alloc); @@ -62,7 +62,7 @@ int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, s * @param recv_bytes size of the data to be scattered * @param root the root process which scatters the data * @param alloc allocation on the NAM - * @return + * @return 0 if scatter was successful, 1 if not */ int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, size_t recv_bytes, int root, nam_ext_allocation_t *ext_alloc); @@ -74,7 +74,7 @@ int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, * @param bytes size of the data to be broadcasted * @param root the root process which scatters the data * @param alloc allocation on the NAM - * @return + * @return 0 if bcast was successful, 1 if not */ int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocation_t *ext_alloc); @@ -87,7 +87,7 @@ int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocati * @param comm MPI Communicator to be used * @param alloc allocation on the NAM * @param req request to wait on in this asynchronous operation - * @return + * @return 0 if checkpoint was successful, 1 if not */ int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ext_allocation_t *ext_alloc, nam_async_request_t *req); @@ -97,7 +97,7 @@ int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ex * @param buf the data to participate in a XOR checkpoint. Must not be altered during the operation! * @param bytes size of the local data * @param alloc allocation on the NAM - * @return + * @return 0 if checkpoint was successful, 1 if not */ int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc); @@ -108,7 +108,7 @@ int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc * @param bytes size of the local data * @param data_avail flag,that data should be rebuild by the NAM * @param alloc allocation on the NAM - * @return + * @return 0 if restart was successful, 1 if not */ int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation_t *ext_alloc); @@ -123,7 +123,7 @@ int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation * @param comm MPI Communicator to be used * @param alloc allocation on the NAM * @param req request to wait on in this asynchronous operation - * @return + * @return 0 if restart was successful, 1 if not */ int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ext_allocation_t *ext_alloc, nam_async_request_t *req); @@ -139,6 +139,7 @@ int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ex * @param op to carry out on data vectors * @param return_mode NAM_RETURN/NAM_STORE or any combination * @param alloc allocation to get data from + * @return 0 if vector op was successful, 1 if not */ int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offset, size_t n, void *result, nam_datatype_t type, nam_operation_t op, int return_mode, nam_ext_allocation_t *ext_alloc); @@ -153,6 +154,7 @@ int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offse * @param pattern to search for if OP is NAM_SEARCH_BASIC/NAM_SEARCH_ADVANCED * @param return_mode NAM_RETURN/NAM_STORE or any combination * @param alloc allocation to get data from + * @return 0 if reduce op was successful, 1 if not */ int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, nam_datatype_t type, nam_operation_t op, void *pattern, int return_mode, nam_ext_allocation_t *ext_alloc); @@ -162,6 +164,7 @@ int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, n * @param size of requested memory * @param root rank in comm * @param comm MPI_Communicator to distribute the allocation + * @return Extended Allocation or NULL */ nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm); @@ -171,7 +174,7 @@ nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm); * @param size size of requested memory * @param root process which requests the allocation * @param comm MPI Communicator where the allocation should be distributed in - * @return + * @return Extended Allocation or NULL */ nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm); @@ -181,7 +184,7 @@ nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm); * @param size size of requested memory * @param root process which requests the allocation * @param comm MPI Communicator where the allocation should be distributed in - * @return + * @return Extended Allocation or NULL */ nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm comm); @@ -191,7 +194,7 @@ nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm * @param size size of one element * @param root process which requests the allocation * @param comm MPI Communicator where the allocation should be distributed in - * @return + * @return Extended Allocation or NULL */ nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Comm comm); @@ -200,7 +203,7 @@ nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Co * @param alloc Allocation to be freed * @param root process which should request the free on the NAM Manager * @param comm MPI Communicator on which the allocation should be freed - * @return + * @return 0 if free was successful, 1 if not */ int nam_free_all(nam_ext_allocation_t *ext_alloc); diff --git a/src/nam_ext_interface.c b/src/nam_ext_interface.c index b9e18913e5c8906a993f580378e7756bbd8fcf49..c53fe565ba2d54dc4601a2b5cc9cee12fe221c38 100644 --- a/src/nam_ext_interface.c +++ b/src/nam_ext_interface.c @@ -236,8 +236,7 @@ int write_ext_allocation(char *path, nam_ext_allocation_t *ext_alloc) mode_t mode = S_IRUSR | S_IWUSR; //create path $HOME/.libNAM/<challenge>.alloc_ext sprintf(file_path, "%s/%"PRIu64".alloc_ext",nam_home, alloc->challenge); - printf("Writing file <%s> \n", file_path); - fflush(stdout); + f = fopen(file_path, "w"); if(!f) { @@ -638,6 +637,14 @@ int nam_restart(nam_cp_args_t *args) nam_reset = 0; //must be set in order to restart extoll_reset_checkpointing(config_con); + //1. Write the number of Ranks to CR-C0 + if(extoll_configure_num_ranks(size, config_con)) + { + nam_print(0, "Error configuring #Ranks!"); + rc = 1; + goto error; + } + //configure all other processes for (i = 0; i < size; ++i) { @@ -682,14 +689,16 @@ int nam_restart(nam_cp_args_t *args) if(!rc) { + extoll_check_notifications_block(config_con); //when the new rank gets its noti, reconstruction is finished! if(!my_data_avail) { - extoll_check_notifications_block(config_con); + nam_get_sync(buf, 0, my_bytes, alloc); } } + sleep(3); MPI_Barrier(comm); //set all memory to unactive extoll_reset_memory(); @@ -704,7 +713,7 @@ int nam_restart(nam_cp_args_t *args) } int nam_checkpoint(nam_cp_args_t *args) { - int size = 4; + int size = 0; int my_rank; int my_rank_global; int i; @@ -1249,7 +1258,7 @@ nam_ext_allocation_t *nam_reuse_cprs(int root, MPI_Comm comm) //split the communicator with nam index found->comm_global = comm; -// MPI_Comm_split(found->comm_global, found->alloc->nam_index, my_rank, &found->comm_local); + MPI_Comm_split(found->comm_global, found->alloc->nam_index, my_rank, &found->comm_local); //connect the allocation nam_connect_allocation(found->alloc); diff --git a/src/nam_extoll.c b/src/nam_extoll.c index e7ce40badb4449a4cb4f1fada6d3a82e061dd13a..14bb1e1350c4515870617048cfa7cde40ae729e3 100644 --- a/src/nam_extoll.c +++ b/src/nam_extoll.c @@ -1159,7 +1159,7 @@ int _extoll_encode_crc2(uint64_t byte_count, uint16_t vpid, uint16_t nodeid, int int extoll_decode_crs0(uint64_t payload, uint8_t *num_ranks, uint8_t *status_cr_setup,uint8_t *status_cr) { //allocate operands - uint64_t *operands = calloc(5, sizeof(uint64_t)); + uint64_t *operands = calloc(3, sizeof(uint64_t)); nam_print(10, "Encoded CR-S0: "HEX_FORMAT"", HEX_SPLIT(payload)); // Shift payload and AND with operands @@ -1168,12 +1168,12 @@ int extoll_decode_crs0(uint64_t payload, uint8_t *num_ranks, uint8_t *status_cr_ memcpy(num_ranks, operands + 0, sizeof(uint8_t)); //Error Status CR at Bit 48, 8 Bit - operands[3] = (payload >> 48) & 0xFF; - memcpy(status_cr, operands + 3, sizeof(uint8_t)); + operands[1] = (payload >> 48) & 0xFF; + memcpy(status_cr, operands + 1, sizeof(uint8_t)); //Error Status CR Setup at Bit 56, 8 Bit - operands[4] = (payload >> 56) & 0xFF; - memcpy(status_cr_setup, operands + 4, sizeof(uint8_t)); + operands[2] = (payload >> 56) & 0xFF; + memcpy(status_cr_setup, operands + 2, sizeof(uint8_t)); free(operands);