Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libNAM
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container registry
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Galonska
libNAM
Commits
822e247e
Commit
822e247e
authored
Mar 7, 2017
by
Andreas Galonska
Browse files
Options
Downloads
Patches
Plain Diff
debugging the restart process
parent
6b27a7b8
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
include/nam_ext_interface.h
+14
-11
14 additions, 11 deletions
include/nam_ext_interface.h
src/nam_ext_interface.c
+11
-1
11 additions, 1 deletion
src/nam_ext_interface.c
with
25 additions
and
12 deletions
include/nam_ext_interface.h
+
14
−
11
View file @
822e247e
...
@@ -48,7 +48,7 @@ ssize_t nam_get_max_bytes(nam_ext_allocation_t *ext_alloc);
...
@@ -48,7 +48,7 @@ ssize_t nam_get_max_bytes(nam_ext_allocation_t *ext_alloc);
* @param recv_bytes size of the data to be gathered
* @param recv_bytes size of the data to be gathered
* @param root the root process which gathers the data
* @param root the root process which gathers the data
* @param ext_alloc allocation on the NAM
* @param ext_alloc allocation on the NAM
* @return
* @return
0 if gather was successful, 1 if not
*/
*/
int
nam_gather
(
size_t
offset
,
void
*
sendbuf
,
size_t
send_bytes
,
void
*
recvbuf
,
size_t
recv_bytes
,
int
root
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_gather
(
size_t
offset
,
void
*
sendbuf
,
size_t
send_bytes
,
void
*
recvbuf
,
size_t
recv_bytes
,
int
root
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -62,7 +62,7 @@ int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, s
...
@@ -62,7 +62,7 @@ int nam_gather(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf, s
* @param recv_bytes size of the data to be scattered
* @param recv_bytes size of the data to be scattered
* @param root the root process which scatters the data
* @param root the root process which scatters the data
* @param alloc allocation on the NAM
* @param alloc allocation on the NAM
* @return
* @return
0 if scatter was successful, 1 if not
*/
*/
int
nam_scatter
(
size_t
offset
,
void
*
sendbuf
,
size_t
send_bytes
,
void
*
recvbuf
,
size_t
recv_bytes
,
int
root
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_scatter
(
size_t
offset
,
void
*
sendbuf
,
size_t
send_bytes
,
void
*
recvbuf
,
size_t
recv_bytes
,
int
root
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -74,7 +74,7 @@ int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf,
...
@@ -74,7 +74,7 @@ int nam_scatter(size_t offset, void *sendbuf, size_t send_bytes, void *recvbuf,
* @param bytes size of the data to be broadcast
* @param bytes size of the data to be broadcast
* @param root the root process which broadcasts the data
* @param root the root process which broadcasts the data
* @param alloc allocation on the NAM
* @param alloc allocation on the NAM
* @return
* @return
0 if bcast was successful, 1 if not
*/
*/
int
nam_bcast
(
size_t
offset
,
void
*
buf
,
size_t
bytes
,
int
root
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_bcast
(
size_t
offset
,
void
*
buf
,
size_t
bytes
,
int
root
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -87,7 +87,7 @@ int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocati
...
@@ -87,7 +87,7 @@ int nam_bcast(size_t offset, void *buf, size_t bytes, int root, nam_ext_allocati
* @param comm MPI Communicator to be used
* @param comm MPI Communicator to be used
* @param alloc allocation on the NAM
* @param alloc allocation on the NAM
* @param req request to wait on in this asynchronous operation
* @param req request to wait on in this asynchronous operation
* @return
* @return
0 if checkpoint was successful, 1 if not
*/
*/
int
nam_checkpoint_async
(
void
*
buf
,
size_t
bytes
,
int
root
,
MPI_Comm
comm
,
nam_ext_allocation_t
*
ext_alloc
,
nam_async_request_t
*
req
);
int
nam_checkpoint_async
(
void
*
buf
,
size_t
bytes
,
int
root
,
MPI_Comm
comm
,
nam_ext_allocation_t
*
ext_alloc
,
nam_async_request_t
*
req
);
...
@@ -97,7 +97,7 @@ int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ex
...
@@ -97,7 +97,7 @@ int nam_checkpoint_async(void *buf, size_t bytes, int root, MPI_Comm comm,nam_ex
* @param buf the data to participate in a XOR checkpoint. Must not be altered during the operation!
* @param buf the data to participate in a XOR checkpoint. Must not be altered during the operation!
* @param bytes size of the local data
* @param bytes size of the local data
* @param alloc allocation on the NAM
* @param alloc allocation on the NAM
* @return
* @return
0 if checkpoint was successful, 1 if not
*/
*/
int
nam_checkpoint_sync
(
void
*
buf
,
size_t
bytes
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_checkpoint_sync
(
void
*
buf
,
size_t
bytes
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -108,7 +108,7 @@ int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc
...
@@ -108,7 +108,7 @@ int nam_checkpoint_sync(void *buf, size_t bytes, nam_ext_allocation_t *ext_alloc
* @param bytes size of the local data
* @param bytes size of the local data
* @param data_avail flag, that data should be rebuilt by the NAM
* @param data_avail flag, that data should be rebuilt by the NAM
* @param alloc allocation on the NAM
* @param alloc allocation on the NAM
* @return
* @return
0 if restart was successful, 1 if not
*/
*/
int
nam_restart_sync
(
void
*
buf
,
size_t
bytes
,
int
data_avail
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_restart_sync
(
void
*
buf
,
size_t
bytes
,
int
data_avail
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -123,7 +123,7 @@ int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation
...
@@ -123,7 +123,7 @@ int nam_restart_sync(void *buf, size_t bytes, int data_avail, nam_ext_allocation
* @param comm MPI Communicator to be used
* @param comm MPI Communicator to be used
* @param alloc allocation on the NAM
* @param alloc allocation on the NAM
* @param req request to wait on in this asynchronous operation
* @param req request to wait on in this asynchronous operation
* @return
* @return
0 if restart was successful, 1 if not
*/
*/
int
nam_restart_async
(
void
*
buf
,
size_t
bytes
,
int
root
,
int
data_avail
,
nam_ext_allocation_t
*
ext_alloc
,
nam_async_request_t
*
req
);
int
nam_restart_async
(
void
*
buf
,
size_t
bytes
,
int
root
,
int
data_avail
,
nam_ext_allocation_t
*
ext_alloc
,
nam_async_request_t
*
req
);
...
@@ -139,6 +139,7 @@ int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ex
...
@@ -139,6 +139,7 @@ int nam_restart_async(void *buf, size_t bytes, int root, int data_avail, nam_ex
* @param op to carry out on data vectors
* @param op to carry out on data vectors
* @param return_mode NAM_RETURN/NAM_STORE or any combination
* @param return_mode NAM_RETURN/NAM_STORE or any combination
* @param alloc allocation to get data from
* @param alloc allocation to get data from
* @return 0 if vector op was successful, 1 if not
*/
*/
int
nam_vector_op
(
size_t
first_offset
,
size_t
second_offset
,
size_t
result_offset
,
size_t
n
,
void
*
result
,
nam_datatype_t
type
,
nam_operation_t
op
,
int
return_mode
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_vector_op
(
size_t
first_offset
,
size_t
second_offset
,
size_t
result_offset
,
size_t
n
,
void
*
result
,
nam_datatype_t
type
,
nam_operation_t
op
,
int
return_mode
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -153,6 +154,7 @@ int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offse
...
@@ -153,6 +154,7 @@ int nam_vector_op(size_t first_offset, size_t second_offset, size_t result_offse
* @param pattern to search for if OP is NAM_SEARCH_BASIC/NAM_SEARCH_ADVANCED
* @param pattern to search for if OP is NAM_SEARCH_BASIC/NAM_SEARCH_ADVANCED
* @param return_mode NAM_RETURN/NAM_STORE or any combination
* @param return_mode NAM_RETURN/NAM_STORE or any combination
* @param alloc allocation to get data from
* @param alloc allocation to get data from
* @return 0 if reduce op was successful, 1 if not
*/
*/
int
nam_reduce_op
(
size_t
offset
,
size_t
result_offset
,
size_t
n
,
void
*
result
,
nam_datatype_t
type
,
nam_operation_t
op
,
void
*
pattern
,
int
return_mode
,
nam_ext_allocation_t
*
ext_alloc
);
int
nam_reduce_op
(
size_t
offset
,
size_t
result_offset
,
size_t
n
,
void
*
result
,
nam_datatype_t
type
,
nam_operation_t
op
,
void
*
pattern
,
int
return_mode
,
nam_ext_allocation_t
*
ext_alloc
);
...
@@ -162,6 +164,7 @@ int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, n
...
@@ -162,6 +164,7 @@ int nam_reduce_op(size_t offset, size_t result_offset, size_t n, void *result, n
* @param size of requested memory
* @param size of requested memory
* @param root rank in comm
* @param root rank in comm
* @param comm MPI_Communicator to distribute the allocation
* @param comm MPI_Communicator to distribute the allocation
* @return Extended Allocation or NULL
*/
*/
nam_ext_allocation_t
*
nam_alloc_cprs
(
int
root
,
MPI_Comm
comm
);
nam_ext_allocation_t
*
nam_alloc_cprs
(
int
root
,
MPI_Comm
comm
);
...
@@ -171,7 +174,7 @@ nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm);
...
@@ -171,7 +174,7 @@ nam_ext_allocation_t *nam_alloc_cprs(int root, MPI_Comm comm);
* @param size size of requested memory
* @param size size of requested memory
* @param root process which requests the allocation
* @param root process which requests the allocation
* @param comm MPI Communicator where the allocation should be distributed in
* @param comm MPI Communicator where the allocation should be distributed in
* @return
* @return
Extended Allocation or NULL
*/
*/
nam_ext_allocation_t
*
nam_malloc_all
(
size_t
size
,
int
root
,
MPI_Comm
comm
);
nam_ext_allocation_t
*
nam_malloc_all
(
size_t
size
,
int
root
,
MPI_Comm
comm
);
...
@@ -181,7 +184,7 @@ nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm);
...
@@ -181,7 +184,7 @@ nam_ext_allocation_t *nam_malloc_all(size_t size, int root, MPI_Comm comm);
* @param size size of requested memory
* @param size size of requested memory
* @param root process which requests the allocation
* @param root process which requests the allocation
* @param comm MPI Communicator where the allocation should be distributed in
* @param comm MPI Communicator where the allocation should be distributed in
* @return
* @return
Extended Allocation or NULL
*/
*/
nam_ext_allocation_t
*
nam_malloc_all_persistant
(
size_t
size
,
int
root
,
MPI_Comm
comm
);
nam_ext_allocation_t
*
nam_malloc_all_persistant
(
size_t
size
,
int
root
,
MPI_Comm
comm
);
...
@@ -191,7 +194,7 @@ nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm
...
@@ -191,7 +194,7 @@ nam_ext_allocation_t *nam_malloc_all_persistant(size_t size, int root, MPI_Comm
* @param size size of one element
* @param size size of one element
* @param root process which requests the allocation
* @param root process which requests the allocation
* @param comm MPI Communicator where the allocation should be distributed in
* @param comm MPI Communicator where the allocation should be distributed in
* @return
* @return
Extended Allocation or NULL
*/
*/
nam_ext_allocation_t
*
nam_calloc_all
(
size_t
nmemb
,
size_t
size
,
int
root
,
MPI_Comm
comm
);
nam_ext_allocation_t
*
nam_calloc_all
(
size_t
nmemb
,
size_t
size
,
int
root
,
MPI_Comm
comm
);
...
@@ -200,7 +203,7 @@ nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Co
...
@@ -200,7 +203,7 @@ nam_ext_allocation_t *nam_calloc_all(size_t nmemb, size_t size, int root, MPI_Co
* @param alloc Allocation to be freed
* @param alloc Allocation to be freed
* @param root process which should request the free on the NAM Manager
* @param root process which should request the free on the NAM Manager
* @param comm MPI Communicator on which the allocation should be freed
* @param comm MPI Communicator on which the allocation should be freed
* @return
* @return
0 if free was successful, 1 if not
*/
*/
int
nam_free_all
(
nam_ext_allocation_t
*
ext_alloc
);
int
nam_free_all
(
nam_ext_allocation_t
*
ext_alloc
);
...
...
This diff is collapsed.
Click to expand it.
src/nam_ext_interface.c
+
11
−
1
View file @
822e247e
...
@@ -641,6 +641,14 @@ int nam_restart(nam_cp_args_t *args)
...
@@ -641,6 +641,14 @@ int nam_restart(nam_cp_args_t *args)
nam_reset
=
0
;
//must be set in order to restart
nam_reset
=
0
;
//must be set in order to restart
extoll_reset_checkpointing
(
config_con
);
extoll_reset_checkpointing
(
config_con
);
//1. Write the number of Ranks to CR-C0
if
(
extoll_configure_num_ranks
(
size
,
config_con
))
{
nam_print
(
0
,
"Error configuring #Ranks!"
);
rc
=
1
;
goto
error
;
}
//configure all other processes
//configure all other processes
for
(
i
=
0
;
i
<
size
;
++
i
)
for
(
i
=
0
;
i
<
size
;
++
i
)
{
{
...
@@ -685,14 +693,16 @@ int nam_restart(nam_cp_args_t *args)
...
@@ -685,14 +693,16 @@ int nam_restart(nam_cp_args_t *args)
if
(
!
rc
)
if
(
!
rc
)
{
{
extoll_check_notifications_block
(
config_con
);
//when the new rank gets its noti, reconstruction is finished!
//when the new rank gets its noti, reconstruction is finished!
if
(
!
my_data_avail
)
if
(
!
my_data_avail
)
{
{
extoll_check_notifications_block
(
config_con
);
nam_get_sync
(
buf
,
0
,
my_bytes
,
alloc
);
nam_get_sync
(
buf
,
0
,
my_bytes
,
alloc
);
}
}
}
}
sleep
(
3
);
MPI_Barrier
(
comm
);
MPI_Barrier
(
comm
);
//set all memory to inactive
//set all memory to inactive
extoll_reset_memory
();
extoll_reset_memory
();
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
sign in
to comment