diff --git a/include/maestro/i_ofi.h b/include/maestro/i_ofi.h
index cfb509478b49979aafbbed050ff0407b739240fa..7b825a91db1d7970f2a7f4ffb96cb6f2be1a3093 100644
--- a/include/maestro/i_ofi.h
+++ b/include/maestro/i_ofi.h
@@ -140,6 +140,17 @@ mstro_ofi__remember_ctx(struct mstro_endpoint *ep,
 mstro_status
 mstro_ofi__maybe_bind_and_enable_mreg(struct fid_ep *ep, int mr_mode,
                                       struct fid_mr *mr);
+/** Check if we need to register local buffers for fi_read (writes into buffer)
+ *  - Check whether it is host or device memory, @arg mem_layer (0: host, 1: device)
+ *  - Register memory region @arg buf
+ *  - Check if binding registration to endpoint is needed
+ * @arg mr and @arg local_buf_mr_desc are output variables that are set to NULL if registration is not required
+ */
+mstro_status
+mstro_ofi__maybe_register_as_read_target_buffer(
+    int mem_layer, int mr_mode, struct fid_ep *ep,
+    struct fid_domain *domain, const void* buf, size_t len,
+    struct fid_mr **mr, void **local_buf_mr_desc);
 
 /* a hash table for the set of live message contexts */
 KHASH_INIT(ctxtab, mstro_ofi_msg_context,
diff --git a/maestro/ofi.c b/maestro/ofi.c
index 473b851261a8393a13544049c722f01d953cc12e..2996b0306a1a6d942db4315866fbad24bd3f6bbd 100644
--- a/maestro/ofi.c
+++ b/maestro/ofi.c
@@ -883,6 +883,112 @@ mstro_mr_key_get(struct fi_info* fi, struct fid_mr* mr,
   return MSTRO_OK;
 }
 
+/** Register a host buffer as local fi_read target if @arg mr_mode contains FI_MR_LOCAL.
+ * *mr and *local_buf_mr_desc are NULL unless fi_mr_reg succeeded; if only the bind step
+ * fails *mr stays registered and has to be closed by the caller.
+ */
+static inline
+mstro_status
+mstro_ofi__maybe_register_as_read_target_buffer_host(
+    int mr_mode, struct fid_ep *ep,
+    struct fid_domain *domain, const void* buf, size_t len,
+    struct fid_mr **mr, void **local_buf_mr_desc)
+{
+  mstro_status status = MSTRO_UNIMPL;
+  /* defined outputs on every path: callers test *mr against NULL before fi_close() */
+  *mr = NULL;
+  *local_buf_mr_desc = NULL;
+  if(mr_mode & FI_MR_LOCAL) {
+    uint64_t requested_key = mr_mode & FI_MR_PROV_KEY ? 0 : mstro_memory_new_key();
+
+    int err = fi_mr_reg(domain,buf, len,
+                        FI_READ, 0, requested_key, 0, mr, NULL);
+    if (err) {
+      ERR("Couldn't register memory region for RDMA transport (err: %d, %s)\n",
+          err, fi_strerror(-err));
+      *mr = NULL; /* never hand a failed registration back to the caller */
+      return MSTRO_OFI_FAIL;
+    }
+    *local_buf_mr_desc = fi_mr_desc(*mr);
+    DEBUG("Registered memory region for RDMA transport successfully\n");
+    /*maybe bind memory to ep */
+    status = mstro_ofi__maybe_bind_and_enable_mreg(ep,mr_mode, *mr);
+
+  }
+  else
+  {
+    DEBUG("Fabric provider does not require registering host local buffers\n");
+    status= MSTRO_OK;
+  }
+  return status;
+}
+/** Device-memory counterpart of the host variant; registration is not implemented yet.
+ * *mr and *local_buf_mr_desc are always set to NULL; returns MSTRO_UNIMPL if FI_MR_HMEM is set.
+ */
+static inline
+mstro_status
+mstro_ofi__maybe_register_as_read_target_buffer_device(
+    int mr_mode, struct fid_ep *ep,
+    struct fid_domain *domain, const void* buf, size_t len,
+    struct fid_mr **mr, void **local_buf_mr_desc)
+{
+  mstro_status status = MSTRO_UNIMPL;
+  /* nothing is registered here yet, so never leave the outputs indeterminate */
+  *mr = NULL;
+  *local_buf_mr_desc = NULL;
+  if(mr_mode & FI_MR_HMEM) {
+    //TODO add device memory registration code here
+  }
+  else
+  {
+    DEBUG("Fabric provider does not require registering device local buffers\n");
+    status= MSTRO_OK;
+  }
+  return status;
+}
+/** Check if we need to register fi read local buffers (writes into)
+ *  - Check whether the buffer is host or device memory
+ *  - Register memory region
+ *  - Check if binding registration to endpoint is needed
+ * from https://ofiwg.github.io/libfabric/v1.15.0/man/fi_mr.3.html
+ *  FI_LOCAL_MR is deprecated with 1.5 and above
+ *  FI_MR_LOCAL means you always need to register memory, even for receive operations.
+ *      NULL should be used for desc if no registration was done.
+ *  FI_MR_HMEM requires us to register any device memory passed in (when FI_HMEM is used).
+ *  FI_MR_MMU_NOTIFY indicates that we need to call fi_mr_refresh if the memory region is backed by physical pages only later.
+ *      We always pin memory, so we can support this flag but never need to call refresh (currently)
+ *  FI_MR_COLLECTIVE requires registration. This is for later if we ever start using libfabric collectives
+ *
+ * So only if MR_LOCAL or HMEM is set do we need registration on the receiving side.
+ *
+ * @arg mem_layer is 0 for host and 1 for device memory; any other value yields MSTRO_UNIMPL.
+ * @arg mr and @arg local_buf_mr_desc are always written; *mr is NULL when there is nothing to fi_close().
+ */
+mstro_status
+mstro_ofi__maybe_register_as_read_target_buffer(
+    int mem_layer, int mr_mode, struct fid_ep *ep,
+    struct fid_domain *domain, const void* buf, size_t len,
+    struct fid_mr **mr, void **local_buf_mr_desc)
+{
+  mstro_status status = MSTRO_UNIMPL;
+  if(mem_layer == 0) { /* host */
+    status = mstro_ofi__maybe_register_as_read_target_buffer_host(
+        mr_mode,ep, domain, buf, len, mr, local_buf_mr_desc);
+  }
+  else if(mem_layer == 1){ /* device */
+    status = mstro_ofi__maybe_register_as_read_target_buffer_device(
+        mr_mode,ep, domain, buf, len, mr, local_buf_mr_desc);
+  }
+  else
+  {
+    ERR("Unknown buffer memory layer\n");
+    /* callers fi_close() *mr whenever it is non-NULL, so define the outputs here too */
+    *mr = NULL;
+    *local_buf_mr_desc = NULL;
+  }
+  return status;
+}
+
 /** check if @arg mr_mode indicates that @arg mr needs to be bound to @arg ep and enabled before being usable. If so: do it */
 mstro_status
 mstro_ofi__maybe_bind_and_enable_mreg(struct fid_ep *ep, int mr_mode, struct fid_mr *mr)
@@ -1837,7 +1943,7 @@ mstro_ofi_init(void)
   /* we really want 1.14 or above */
   stat = fi_getinfo(MSTRO_OFI_VERSION, NULL, NULL, 0, hints, &fi);
   fi_freeinfo(hints);
-  
+
   if(stat!=0) {
     ERR("fi_getinfo failed: %d (%s)\n", stat, fi_strerror(-stat));
     retstat=MSTRO_FAIL; goto BAILOUT_FAIL;
@@ -1921,36 +2027,31 @@ mstro_ofi_init(void)
 
     struct fid_mr *pm_info_mr = NULL;
     const struct fi_info* fi = g_endpoints->eps[g_endpoints->size].fi;
-
-    uint64_t requested_key = fi->domain_attr->mr_mode & FI_MR_PROV_KEY ? 0 : mstro_memory_new_key();
-    int ret = fi_mr_reg(g_endpoints->eps[g_endpoints->size].domain,
-                        &g_pm_component_descriptor, sizeof(g_pm_component_descriptor),
-                        FI_READ, 0, requested_key, 0, &pm_info_mr, NULL);
-    if(ret<0) {
-      ERR("Failed to register component descriptor read buffer on domain of ep %zu, skipping: %d (%s)\n",
-          g_endpoints->size, ret, fi_strerror(-ret));
-      // drop memlock that ep_build_from_ofi did
-      retstat = mstro_memunlock(&g_component_descriptor, sizeof(g_component_descriptor));
-      goto TRY_NEXT_EP;
-    }
-    DEBUG("registered peer buffer for RDMA, addr %p, mr 0x%" PRIx64 ", desc %p\n",
-          &g_pm_component_descriptor, pm_info_mr, fi_mr_desc(pm_info_mr));
-
-    /* some domains do per-endpoint memory registration, cater to that */
-    retstat = mstro_ofi__maybe_bind_and_enable_mreg(g_endpoints->eps[g_endpoints->size].ep,
-                                                    fi->domain_attr->mr_mode,
-                                                    pm_info_mr);
-    if(retstat!=MSTRO_OK) {
-      ERR("Failed to bind peer buffer mreg to endpoint %zu, skipping\n",
-          g_endpoints->size);
-      ret = fi_close((struct fid *)pm_info_mr);
-      if(ret<0) {
-        ERR("Failed to close PM-info mreg: %d (%s)\n", ret, strerror(-ret));
-        retstat=MSTRO_OFI_FAIL;
-      }
-      retstat |= mstro_memunlock(&g_component_descriptor, sizeof(g_component_descriptor));
-      goto TRY_NEXT_EP;
+    void* local_buf_mr_desc = NULL;
+    /*maybe register and bind local component descriptor buffers for reads if needed*/
+    retstat = mstro_ofi__maybe_register_as_read_target_buffer(
+        0, /*host memory layer */
+        fi->domain_attr->mr_mode, g_endpoints->eps[g_endpoints->size].ep,
+        g_endpoints->eps[g_endpoints->size].domain, &g_pm_component_descriptor,sizeof(g_pm_component_descriptor),
+        &pm_info_mr, &local_buf_mr_desc);
+    if(retstat != MSTRO_OK) {
+      ERR("Failed to register or bind component descriptor read buffer on domain of ep %zu\n",
+          g_endpoints->size);
+      if (pm_info_mr != NULL) {
+        int ret = fi_close((struct fid *)pm_info_mr);
+        if(ret<0) {
+          ERR("Failed to close PM-info mreg: %d (%s)\n", ret, fi_strerror(-ret));
+          retstat=MSTRO_OFI_FAIL;
+        }
+      }
+      retstat |= mstro_memunlock(&g_component_descriptor, sizeof(g_component_descriptor));
+      goto TRY_NEXT_EP;
+    }
+    else
+    {
+      DEBUG("registered peer buffer for RDMA, addr %p, mr %p, desc %p\n",
+            (void*)&g_pm_component_descriptor, (void*)pm_info_mr, local_buf_mr_desc);
     }
 
     g_endpoints->eps[g_endpoints->size].peer_info_mr = pm_info_mr;
 
diff --git a/transport/rdma.c b/transport/rdma.c
index 1e8b1cb5b746963fb1a286db2abba53a6f60ae4e..0fec36137f1b961035091e2bd082f90a89dba3d8 100644
--- a/transport/rdma.c
+++ b/transport/rdma.c
@@ -520,17 +520,25 @@ mstro_transport_rdma_dst_execute(mstro_cdo cdo_dst, Mstro__Pool__TransferTicket*
     }
   }
 
-  uint64_t requested_key = app_entry->ep->fi->domain_attr->mr_mode & FI_MR_PROV_KEY ? 0 : mstro_memory_new_key();
-  int err = fi_mr_reg(app_entry->ep->domain,(uint8_t*) cdo_dst->raw_ptr+ticket->dst_offset, len,
-                      FI_READ, 0, requested_key, 0, &mr, NULL);
-  if (err) {
-    ERR("Couldn't register memory region for RDMA transport (err: %d, %s)\n",
-        err, fi_strerror(-err));
-    return MSTRO_FAIL;
+
+  void* local_buf_mr_desc = NULL;
+  /* as before this refactoring, the registered window starts at dst_offset, not at raw_ptr */
+  status = mstro_ofi__maybe_register_as_read_target_buffer(
+      0, /*host memory layer TODO check cdo memory layer here and pass the correct type*/
+      app_entry->ep->fi->domain_attr->mr_mode, app_entry->ep->ep,
+      app_entry->ep->domain, (uint8_t*) cdo_dst->raw_ptr+ticket->dst_offset, len,
+      &mr, &local_buf_mr_desc);
+  if(status != MSTRO_OK) {
+    ERR("Failed to register or bind cdo memory for RDMA transport\n");
+    if (mr != NULL)
+    {
+      int ret = fi_close((struct fid *)mr);
+      if(ret<0) {
+        ERR("Failed to close dst cdo mreg: %d (%s)\n", ret, fi_strerror(-ret));
+      }
+    }
+    return status;
   }
-
-  void* local_buf_mr_desc = fi_mr_desc(mr);
-
   DEBUG("Doing closure creation\n");
   closure = malloc(sizeof(struct mstro_transport_rdma_cb_args));
   if(closure==NULL) {
@@ -656,11 +664,18 @@ mstro_transport_rdma_cb(mstro_event ev, void* closure)
     return;
   }
 
-  // unregister
-  int err = fi_close((struct fid*)args->mr);
-  if (err) {
-    ERR("Couldn't unregister memory region for RDMA transport (err: %d)\n", err);
-    args->status = MSTRO_FAIL;
+  /* Unregister memory region */
+  /* Check if it was registered in the first place */
+  if (args->mr != NULL)
+  {
+    int err = fi_close((struct fid*)args->mr);
+    if (err) {
+      ERR("Couldn't unregister memory region for RDMA transport (err: %d)\n", err);
+      args->status = MSTRO_FAIL;
+    }
+  }
+  else {
+    DEBUG("Local memory register was not registered for RDMA (not required) \n");
   }
 
   // send completion to PM and sender