From 20646caa2855c9fe82165be1a41303cb1a03738a Mon Sep 17 00:00:00 2001 From: Utz-Uwe Haus <uhaus@cray.com> Date: Wed, 26 Jun 2019 09:44:38 +0200 Subject: [PATCH] some OFI work --- configure.ac | 4 + include/maestro.h | 2 +- include/maestro/conductor.h | 7 ++ include/maestro/i_conductor_protocol.h | 58 +++++++++++ include/maestro/i_ofi.h | 123 +---------------------- maestro/Makefile.am | 23 ++++- maestro/conductor_mpi.c | 0 maestro/{conductor.c => conductor_ofi.c} | 31 +++++- maestro/conductor_smp.c | 0 maestro/core.c | 2 +- maestro/ofi.c | 118 +++++++++++++++++++++- tests/check_conductor.c | 40 +++++++- 12 files changed, 275 insertions(+), 133 deletions(-) create mode 100644 include/maestro/i_conductor_protocol.h create mode 100644 maestro/conductor_mpi.c rename maestro/{conductor.c => conductor_ofi.c} (78%) create mode 100644 maestro/conductor_smp.c diff --git a/configure.ac b/configure.ac index d23aff22..de30f061 100644 --- a/configure.ac +++ b/configure.ac @@ -167,4 +167,8 @@ AC_CONFIG_FILES([ AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) +AM_CONDITIONAL([WITH_OFI_CONDUCTOR], [test x = x]) +AM_CONDITIONAL([WITH_MPI_CONDUCTOR], [test x = y]) +AM_CONDITIONAL([WITH_SMP_CONDUCTOR], [test x = y]) + AC_OUTPUT diff --git a/include/maestro.h b/include/maestro.h index a2bbac54..a73eb16d 100644 --- a/include/maestro.h +++ b/include/maestro.h @@ -78,7 +78,7 @@ extern "C" { ** @param component_name The component ID. ** ** @param component_index { The unique index among all processes - ** using the same @arg component_id.} + ** using the same @arg component_name.} ** ** @return { a status value, @ref MSTRO_OK on success } **/ diff --git a/include/maestro/conductor.h b/include/maestro/conductor.h index 9ed5af15..a2e4548e 100644 --- a/include/maestro/conductor.h +++ b/include/maestro/conductor.h @@ -80,6 +80,13 @@ mstro_status mstro_conductor_start(int64_t conductor_size); +/**@brief ask conductor to terminate + * + * This can be called from any participant; it will return when the conductor + * has been notified of the request, but termination can occur much later + */ +mstro_status +mstro_conductor_terminate(void); /**@} (end of group MSTRO_CONDUCTOR) */ diff --git a/include/maestro/i_conductor_protocol.h b/include/maestro/i_conductor_protocol.h new file mode 100644 index 00000000..562602f2 --- /dev/null +++ b/include/maestro/i_conductor_protocol.h @@ -0,0 +1,58 @@ +/* -*- mode:c -*- */ +/** @file + ** @brief Abstract core lib <-> conductor protocol + ** + **/ +/* + * Copyright (C) 2019 Cray Computer GmbH + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef MSTRO_CONDUCTOR_PROTOCOL_H_ +#define MSTRO_CONDUCTOR_PROTOCOL_H_ 1 +#include "maestro.h" +#include "maestro/core.h" + +enum mstro_conductor_msg_type { + MSTRO_CMSG_INVALID = 0, + /* documented component protocol messages */ + MSTRO_CMSG_DECLARE_CDO, + + /* internal messages */ + MSTRO_CMSG_TERMINATE_CONDUCTOR, + MSTRO_CMSG__MAX +}; + +#define MSG_BUF_LEN 1018 + +struct mstro_conductor_msg { + enum mstro_conductor_msg_type type; + char payload[MSG_BUF_LEN]; +}; + +#endif /* MSTRO_CONDUCTOR_PROTOCOL_H_ */ diff --git a/include/maestro/i_ofi.h b/include/maestro/i_ofi.h index c520cc77..60e218be 100644 --- a/include/maestro/i_ofi.h +++ b/include/maestro/i_ofi.h @@ -48,127 +48,10 @@ mstro_status mstro_ofi_init(void); -#include <stdint.h> -#include <stdbool.h> -#include <string.h> - -/** CDO state bitfield */ -typedef uint32_t mstro_cdo_state; - -/** invalid CDO state */ -#define MSTRO_CDO_STATE_INVALID 0 -/** CDO in declaration phase */ -#define MSTRO_CDO_STATE_CREATED (1<<0) -/** CDO fully declared */ -#define MSTRO_CDO_STATE_DECLARED (1<<1) - -/** CDO pooled by PROVIDE */ -#define MSTRO_CDO_STATE_PROVIDED (1<<2) -/** CDO pooled by REQUIRE */ -#define MSTRO_CDO_STATE_REQUIRED (1<<3) -/** CDO pooled */ -#define MSTRO_CDO_STATE_POOLED ( MSTRO_CDO_STATE_PROVIDED \ - | MSTRO_CDO_STATE_REQUIRED) - -/** CDO returned from pool by WITHDRAW */ -#define MSTRO_CDO_STATE_WITHDRAWN (1<<4) -/** CDO returned from pool by DEMAND */ -#define MSTRO_CDO_STATE_DEMANDED (1<<5) -/** CDO returned from pool by REVOKE */ -#define MSTRO_CDO_STATE_REVOKED (1<<6) -/** CDO returned from pool */ -#define MSTRO_CDO_STATE_RETURNED ( MSTRO_CDO_STATE_WITHDRAWN \ - | MSTRO_CDO_STATE_DEMANDED \ - | MSTRO_CDO_STATE_REVOKED) - -/** CDO dead */ -#define MSTRO_CDO_STATE_DEAD (1<<7) - -/** Length of a CDO ID: 128 bit UUID in bytes */ -#define MSTRO_CDO_ID_NUMBYTES (128/8) - -/** a CDO-ID is given by the octets of the binary representation of the UUID */ -struct mstro_cdo_id { - union { - uint8_t id[MSTRO_CDO_ID_NUMBYTES]; /**< octet view, network byte order */ - uint64_t qw[2]; /**< 2 quadwords view on same */ - }; -}; - -/** length of the string representation of a CDO_ID */ -#define MSTRO_CDO_ID_STR_LEN (36+1) - - -/** The NIL UUID */ -#define MSTRO_CDO_ID_INVALID ((struct mstro_cdo_id){ (uint64_t)0, (uint64_t)0 }) - - -/** - ** @brief Produce canonical string representation for CDOID in NAME. - * - * If *NAME is NULL, allocate (caller must then free). Otherwise it - * must be >= UDJO_CDO_ID_STR_LEN or longer and is simply written - * to. - * - * Remember that for a stack-allocated array - * char foo[MSTRO_CDO_ID_STR_LEN]; - * it is not enough to use &foo as argument; you need to use &dest where - * char *dest=&foo[0]; - */ -mstro_status -mstro_cdo_id__str(const struct mstro_cdo_id *cdoid, char **name); - -/** produce an integral hash for CDOID in *hash. */ -static inline -mstro_status -mstro_cdo_id__hash(const struct mstro_cdo_id *cdoid, uint32_t *hash) -{ - if(cdoid==NULL) - return MSTRO_INVARG; - if(hash==NULL) - return MSTRO_INVOUT; - - *hash = 0; - for (size_t i = MSTRO_CDO_ID_NUMBYTES; i-- > 0;) { - *hash <<= 8; - *hash |= cdoid->id[i]; - } - return MSTRO_OK; -} - -/** Compare CDO IDs */ -static inline -bool -mstro_cdo_id__equal(const struct mstro_cdo_id id1, - const struct mstro_cdo_id id2) -{ - if(memcmp(&id1.id, &id2.id, MSTRO_CDO_ID_NUMBYTES)==0) - return true; - else - return false; -} - -/** - ** @brief populate the (preallocated) CDO ID from name +/**@brief Run Conductor loop on OFI endpoints ** - ** Computes a v5 SHA-1 UUID for NAME and write it to the preallocated *result **/ mstro_status -mstro_cdo_id_from_name(const unsigned char *name, - struct mstro_cdo_id *result); - - - - -/** the structure describing a CDO */ -struct mstro_cdo_ { - /* attributes */ - struct mstro_cdo_id id; /**< the locally unique ID */ - struct mstro_cdo_id gid; /**< the globally unique ID */ - mstro_cdo_state state; /**< current state of the CDO */ - unsigned char *name; - /* scope object */ - /* ... */ -}; +mstro_ofi_conductor_loop(void); -#endif /* MAESTRO_I_CDO_H_ */ +#endif /* MAESTRO_I_OFI_H_ */ diff --git a/maestro/Makefile.am b/maestro/Makefile.am index 06e983b0..ed37be74 100644 --- a/maestro/Makefile.am +++ b/maestro/Makefile.am @@ -53,11 +53,11 @@ libmaestro_core_la_SOURCES = \ $(top_srcdir)/include/maestro/i_uuid_str.h \ $(top_srcdir)/include/maestro/i_uuid_ui64.h \ $(top_srcdir)/include/maestro/i_drc.h \ - $(top_srcdir)/include/maestro/i_ofi.h \ $(top_srcdir)/include/maestro/conductor.h \ $(top_srcdir)/include/maestro/env.h \ $(top_srcdir)/include/maestro/logging.h \ $(top_srcdir)/include/maestro/i_tpl.h \ + $(top_srcdir)/include/maestro/i_conductor_protocol.h \ \ cdo.c \ cdoid.c \ @@ -66,7 +66,22 @@ libmaestro_core_la_SOURCES = \ version.c \ status.c \ uuid.c uuid_sha1.c uuid_str.c uuid_ui128.c uuid_ui64.c base64.c\ - drc.c ofi.c \ - env.c logging.c tpl.c \ - conductor.c + drc.c \ + env.c logging.c tpl.c + +# Conductor implementation varies depending on configuration +if WITH_OFI_CONDUCTOR +libmaestro_core_la_SOURCES+=conductor_ofi.c ofi.c \ + $(top_srcdir)/include/maestro/i_ofi.h +else +if WITH_MPI_CONDUCTOR +libmaestro_core_la_SOURCES+=conductor_mpi.c +else +if WITH_SMP_CONDUCTOR +libmaestro_core_la_SOURCES+=conductor_smp.c +else +libmaestro_core_la_SOURCES+=conductor_dummy.c +endif +endif +endif diff --git a/maestro/conductor_mpi.c b/maestro/conductor_mpi.c new file mode 100644 index 00000000..e69de29b diff --git a/maestro/conductor.c b/maestro/conductor_ofi.c similarity index 78% rename from maestro/conductor.c rename to maestro/conductor_ofi.c index c9066551..c12af077 100644 --- a/maestro/conductor.c +++ b/maestro/conductor_ofi.c @@ -68,15 +68,38 @@ mstro_conductor_start(int64_t conductor_size) /* organize quorum for conductor_size */ /* create connection */ + /* create openfabric endpoint on all interfaces, listen on them, * create advertisable destination string */ stat = mstro_ofi_init(); - - /* run conductor loop */ - sleep (2); + /* all endpoints are now enabled. */ + if(stat!=MSTRO_OK) { + ERR("mstro_ofi_init() failed\n"); + } else { + /* run conductor loop. It will return after receiving a TERM message */ + stat = mstro_ofi_conductor_loop(); + } - return MSTRO_UNIMPL; + INFO("Conductor loop terminated\n"); + return stat; } +mstro_status +mstro_conductor_terminate(void) +{ + mstro_status status = MSTRO_UNIMPL; +#if 0 + struct mstro_conductor_msg *msg = mstro_conductor_msg_create(MSTRO_CMSG_TERMINATE_CONDUCTOR); + if(!msg) { + status = MSTRO_NOMEM; + goto BAILOUT; + } else { + /* no payload */ + status = mstro_u2c_submit_wait(msg); /* send and wait for completion; completion handler cleans up msg */ + } +#endif + + return status; +} diff --git a/maestro/conductor_smp.c b/maestro/conductor_smp.c new file mode 100644 index 00000000..e69de29b diff --git a/maestro/core.c b/maestro/core.c index 4ab1067a..4421c6c2 100644 --- a/maestro/core.c +++ b/maestro/core.c @@ -107,7 +107,7 @@ mstro_core_finalize(void) ERR("mstro_core_finalize() without matching mstro_core_init() call\n"); } else { struct mstro_core_initdata *d = (struct mstro_core_initdata*)ptr; - DEBUG("finalize %s/%s/%"PRIi64 " in thread %" PRIxPTR, + DEBUG("finalize %s/%s/%"PRIi64 " in thread %" PRIxPTR "\n", d->workflow_name, d->component_name, d->component_index, (intptr_t)pthread_self()); diff --git a/maestro/ofi.c b/maestro/ofi.c index cc40e531..cf6aada7 100644 --- a/maestro/ofi.c +++ b/maestro/ofi.c @@ -3,6 +3,7 @@ #include "maestro/i_utlist.h" #include "maestro/i_tpl.h" #include "maestro/i_base64.h" +#include "maestro/i_conductor_protocol.h" #include <rdma/fabric.h> #include <rdma/fi_cm.h> @@ -24,7 +25,7 @@ #endif #include <string.h> - +#include <stdbool.h> @@ -907,6 +908,7 @@ mstro_ep_build_from_ofi(struct mstro_endpoint *dst, char *buf=NULL; size_t buflen = 0; + /* create fabric object */ stat = fi_fabric(fi->fabric_attr, &fabric, NULL); if(stat!=0) { @@ -944,7 +946,12 @@ mstro_ep_build_from_ofi(struct mstro_endpoint *dst, memset(&cq_attr,0, sizeof(cq_attr)); cq_attr.format = FI_CQ_FORMAT_TAGGED; cq_attr.size = 0; /* provider-specific default. Consider fi->rx_attr->size; */ - cq_attr.wait_obj = FI_WAIT_UNSPEC; + // FIXME: currently on macOS I get a 'not supported' for some endpoints if set to FI_WAIT_UNSPEC + //cq_attr.wait_obj = FI_WAIT_UNSPEC; + // FD also does not work, and MUTEX_COND only works for some. + // but leaving it at 0 means NONE, which means we can't do blocking waits... + // MUTEX_COND at least selects all nice IP addresses on macOS (dropping localhost etc) + cq_attr.wait_obj = FI_WAIT_MUTEX_COND; stat = fi_cq_open(domain, &cq_attr, &cq, NULL); if(stat!=0) { ERR("fi_cq_open failed: %d (%s)\n", stat, fi_strerror(-stat)); @@ -968,6 +975,8 @@ mstro_ep_build_from_ofi(struct mstro_endpoint *dst, ERR("fi_endpoint failed: %d (%s)\n", stat, fi_strerror(-stat)); retstat=MSTRO_FAIL; goto BAILOUT_FAIL; } + + //INFO("This is a %s\n", fi_tostr(fi,FI_TYPE_INFO)); /* connectionless endpoint needs address vector */ memset(&av_attr, 0, sizeof(av_attr)); @@ -1183,3 +1192,108 @@ BAILOUT_FAIL: BAILOUT: return retstat; } + + + + +mstro_status +mstro_conductor_handle_msg(struct mstro_conductor_msg *msg) { + /* for now we serialize things in here. Instead we could push to a queue and have a queue runner in a different thread pick things up */ + mstro_status status=MSTRO_UNIMPL; + + switch(msg->type) { + case MSTRO_CMSG_DECLARE_CDO: + INFO("DECLARE_CDO message received\n"); + break; + case MSTRO_CMSG_TERMINATE_CONDUCTOR: + ERR("TERMINATE_CONDUCTOR message should be handled in main loop\n"); + break; + case MSTRO_CMSG_INVALID: + default: + ERR("Illegal message %d\n", msg->type); + break; + } + return status; +} + +mstro_status +mstro_ofi_conductor_loop(void) +{ + mstro_status status = MSTRO_UNIMPL; + size_t i; + struct mstro_conductor_msg** slots=calloc(g_endpoints->size, sizeof(struct mstro_conductor_msg*)); + if(slots==NULL) { + status=MSTRO_NOMEM; + goto BAILOUT; + } + + /* post receives on all endpoints */ + for(i=0; i<g_endpoints->size; i++) { + slots[i] = malloc(sizeof(struct mstro_conductor_msg)); + if(slots[i]==NULL) { + status = MSTRO_NOMEM; + goto BAILOUT; + } + + fi_recv(g_endpoints->eps[i].ep, slots[i], MSG_BUF_LEN, + NULL, FI_ADDR_UNSPEC, NULL); + } + DEBUG("Posted a RECV on each endpoint\n"); + + /* wait for completion on all of them. If a message arrives, handle it and re-post a receive */ + bool terminate = false; + do { + for(i=0; !terminate && i<g_endpoints->size; i++) { + struct fi_cq_entry entry; + int ret; + ret = fi_cq_read(g_endpoints->eps[i].cq, &entry, 1); + DEBUG("fi_cq_read returned\n"); + if(ret>0) { + /* we know that it's message in slot i. FIXME: if we had multiple outstanding recvs per ep we'd need to pass the buffer in the context of the cq */ + INFO("Incoming message of type %d on endpoint %d\n", slots[i]->type, i); + /* process */ + if(slots[i]->type==MSTRO_CMSG_TERMINATE_CONDUCTOR) { + WARN("FIXME: Not checking permission of sender to terminate conductor\n"); + status=MSTRO_OK; + terminate=true; + break; /* out of for loop */ + } + + status = mstro_conductor_handle_msg(slots[i]); + if(status!=MSTRO_OK) { + ERR("Error handling incoming message, aborting\n"); + terminate=true; + break; + } else { + /* repost buffer */ + fi_recv(g_endpoints->eps[i].ep, slots[i], MSG_BUF_LEN, + NULL, FI_ADDR_UNSPEC, NULL); + } + } else { + if (ret != -FI_EAGAIN) { + struct fi_cq_err_entry err_entry; + fi_cq_readerr(g_endpoints->eps[i].cq, &err_entry, 0); + ERR("Failure in fi_cq_read: %s %s\n", + fi_strerror(err_entry.err), + fi_cq_strerror(g_endpoints->eps[i].cq, + err_entry.prov_errno, err_entry.err_data, NULL, 0)); + status = MSTRO_FAIL; + terminate=true; + break; + } else { + ; /* EAGAIN .. we'll be back */ + } + } + } + } while(!terminate); + + +BAILOUT: + if(slots) { + for(i=0; i<g_endpoints->size; i++) + NFREE(slots[i]); + free(slots); + } + return status; +} + diff --git a/tests/check_conductor.c b/tests/check_conductor.c index e5523e1e..0339a90f 100644 --- a/tests/check_conductor.c +++ b/tests/check_conductor.c @@ -40,12 +40,50 @@ #include "maestro.h" #include "maestro/conductor.h" #include <string.h> +#include <stdio.h> +/* to simplify the test (i.e., not run MPI or somesuch) we run two threads: one conductor, one that terminates it */ +#include <pthread.h> + +CHEAT_DECLARE( +mstro_status cancel_fun_retval; +void * +cancel_conductor_fun(void* args) +{ + mstro_status *s = &cancel_fun_retval; + *s = mstro_init("Tests", "CONDUCTOR_CANCEL_THREAD",0); + if(s!=MSTRO_OK) { + fprintf(stderr, "Failed to initialize cancel thread component, status %d\n", s); + goto BAILOUT; + } + *s = mstro_conductor_terminate(); + if(s!=MSTRO_OK) { + fprintf(stderr, "Failed to signal conductor, status %d\n", s); + goto BAILOUT; + } + *s = mstro_finalize(); + if(s!=MSTRO_OK) { + fprintf(stderr, "Failed to finalize cancel thread component, status %d\n", s); + goto BAILOUT; + } +BAILOUT: + pthread_exit(s); +} +) + + +CHEAT_TEST(cdo_start_stop_conductor, + exit(77); +#if 0 + pthread_t cancel_thread; + cheat_assert(0==pthread_create(&cancel_thread,NULL, cancel_conductor_fun, NULL)); -CHEAT_TEST(cdo_declare_works, cheat_assert(MSTRO_OK == mstro_init("Tests","CONDUCTOR",0)); + cheat_assert(MSTRO_OK == mstro_conductor_start(1)); + cheat_assert(MSTRO_OK == mstro_finalize()); +#endif ) -- GitLab