diff --git a/deps/libfabric/.gitignore b/deps/libfabric/.gitignore
index 25708537ef3e8bf1a9c5a95be053860fd158b76c..437aee891ee9fd9c4a57d7b071b18cf8ec54337f 100644
--- a/deps/libfabric/.gitignore
+++ b/deps/libfabric/.gitignore
@@ -73,4 +73,5 @@ fabtests/ubertest/fi_ubertest
 fabtests/benchmarks/fi_*
 fabtests/functional/fi_*
 fabtests/unit/fi_*
+fabtests/multinode/fi_*
 pingpong/fi_*
diff --git a/deps/libfabric/.travis.yml b/deps/libfabric/.travis.yml
index 9c4c83775c4c4b7442370cd0803befe875317133..3be4f54e00c3efa2decbfe222a1ebb602d470d44 100644
--- a/deps/libfabric/.travis.yml
+++ b/deps/libfabric/.travis.yml
@@ -33,6 +33,19 @@ addons:
             - wget
             - abi-compliance-checker
             - abi-dumper
+    coverity_scan:
+      project:
+        name: "ofiwg/libfabric"
+        description: "Libfabric project coverity scans"
+      notification_email: sean.hefty@intel.com
+      build_command_prepend: "./autogen.sh; ./configure"
+      build_command: "make -j2"
+      # It might be overkill to run a full scan across the compiler test matrix
+      # for every PR to master. The coverity addon cannot selectively run for
+      # certain OSes or compilers. Once a couple of runs succeed, change this to a
+      # coverity-scan branch that we push to on demand during releases or as
+      # needed.
+      branch_pattern: master
 
 env:
     global:
@@ -48,6 +61,8 @@ env:
         #- MAKE_FLAGS="AM_CFLAGS=-Werror"
         - MAKE_FLAGS=
         - ASAN_OPTIONS=detect_leaks=0
+        # Encrypted COVERITY_SCAN_TOKEN
+        - secure: "gDU1pbiuGsuPHezMp0X2DEC9+bBu2F+XDqR93JMkIzHNI7ygQX/kXeJT6ly9MH60paSpIolfQFNA6QotKtpZ62X3a9wrhv3In1viB+EJr1wmsPrKfprI+JfZYevPLTn6LUQM0d2zoclRWNJzY/uldc6bEaXXxDKIaRk8pgmNZR4="
 
 # Brew update GNU Autotools so that autogen can succeed
 before_install:
diff --git a/deps/libfabric/AUTHORS b/deps/libfabric/AUTHORS
index 278c4ebc90b29c4dfb455e754012376c824784a7..642b4cb6e6614b59923355a449f5155ee1142f41 100644
--- a/deps/libfabric/AUTHORS
+++ b/deps/libfabric/AUTHORS
@@ -10,6 +10,8 @@ Ana Guerrero López <ana@ekaia.org>
 Anatoliy Rozanov <anatoliy.rozanov@intel.com>
 Andrew Friedley <andrew.friedley@intel.com>
 Andrey Lobanov <andrey.lobanov@intel.com>
+Anthony Zinger <anthony.zinger@hpe.com>
+Ao Li <aolia@amazon.com>
 Arun C Ilango <arun.ilango@intel.com>
 arun ilango <a-ilango@users.noreply.github.com>
 Arun Ilango <arun.ilango@intel.com>
@@ -22,6 +24,7 @@ Ben Turrubiates <bturrubiates@lanl.gov>
 Ben Turrubiates <bturrubi@cisco.com>
 Bernd Schubert <bschubert@ddn.com>
 Brian Barrett <bbarrett@amazon.com>
+Brian J. Murrell <brian@interlinx.bc.ca>
 Brian Li <brian14708@gmail.com>
 Chang Hyun Park <heartinpiece@gmail.com>
 Charles J Archer <charles.j.archer@intel.com>
@@ -55,6 +58,7 @@ Hefty <sean.hefty@intel.com>
 Holger Hoffstätte <holger@applied-asynchrony.com>
 Honggang Li <honli@redhat.com>
 Howard Pritchard <howardp@lanl.gov>
+Hui Zhou <hzhou321@anl.gov>
 Ian Ziemba <ian.ziemba@hpe.com>
 Ignacio Hernandez <ignacio.hernandez@intel.com>
 Ira Weiny <ira.weiny@intel.com>
@@ -62,10 +66,12 @@ Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
 James Dinan <james.dinan@intel.com>
 James Shimek <jshimek@cray.com>
 James Swaro <james.swaro@gmail.com>
+James Swaro <james.swaro@hpe.com>
 James Swaro <jswaro@cray.com>
 Jason Godfrey <godfrey@cray.com>
 Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
 Jay Sternberg <jay.e.sternberg@intel.com>
+Jean-Yves VET <jyvet@ddn.com>
 Jeff Hammond <jeff.r.hammond@intel.com>
 Jeff Hammond <jeff.science@gmail.com>
 Jeff Squyres <jsquyres@cisco.com>
@@ -84,11 +90,13 @@ Jonathan Behrens <fintelia@gmail.com>
 jose <jose@cst-fs.(none)>
 jose <jose@cstnh-8.(none)>
 JoZie <JoZie@users.noreply.github.com>
+jroznova <julia.roznova@intel.com>
 Ken Raffenetti <raffenet@mcs.anl.gov>
 Kevan rehm <krehm@cray.com>
 Kevan Rehm <krehm@cray.com>
 kseager <kayla.seager@intel.com>
 Latchesar Ionkov <lionkov@lanl.gov>
+Leena Radeke <leena.radeke@hpe.com>
 Lisanna Dettwyler <levi.e.dettwyler@intel.com>
 Lisanna Dettwyler <lisanna.dettwyler@intel.com>
 Marcin Salnik <marcin.salnik@intel.com>
@@ -111,12 +119,16 @@ Oblomov, Sergey <hoopoepg@gmail.com>
 Oblomov, Sergey <sergey.oblomov@intel.com>
 OFIWG Bot <ofiwg@lists.openfabrics.org>
 Paolo Inaudi <p91paul@gmail.com>
+patrickbueb <70724661+patrickbueb@users.noreply.github.com>
+Patrick Bueb <patrick.bueb@hpe.com>
 Patrick MacArthur <pmacarth@iol.unh.edu>
 Patrick McCormick <patrick.m.mccormick@intel.com>
 Paul Coffman <pcoffman@anl.gov>
 Pavan Balaji <balaji@anl.gov>
+Peter Gottesman <pgottes@amazon.com>
 Peter Gottesman <pgottesm@cisco.com>
 Phil Carns <carns@mcs.anl.gov>
+Philip Davis <philipdavis01@gmail.com>
 Pierre Roux <piroux@cisco.com>
 Prankur Gupta <prankgup@cisco.com>
 Raghu Raja <craghun@amazon.com>
@@ -147,15 +159,18 @@ Thibault BREZILLON <thibault.brezillon@techsat.com>
 Thomas Smith <thomasm2@cisco.com>
 Tony Zinger <ajz@cray.com>
 tonyzinger <ajz@cray.com>
+Trevor Hendricks <trevorhe@amazon.com>
 Venkata Krishna Nimmagadda <nvkrishna85@gmail.com>
 Venkata Krishna Nimmagadda <venkata.krishna.nimmagadda@intel.com>
 Wei Zhang <wzam@amazon.com>
 Wesley Bland <wesley.bland@intel.com>
 William Zhang <wilzhang@amazon.com>
+Xuezhao Liu <xuezhao.liu@intel.com>
 Xuyang Wang <xuywang@cisco.com>
 Yohann Burette <yohann.burette@intel.com>
 yohann <yohann.burette@intel.com>
 Yulu Jia <yulu.jia@intel.com>
+Zach Tiffany <zachary.tiffany@hpe.com>
 Zach Tiffany <ztiffany@cray.com>
 Zach <ztiffany@cray.com>
 ztaylor <ztaylor@twitter.com>
diff --git a/deps/libfabric/Makefile.am b/deps/libfabric/Makefile.am
index f2a6e27f027e05c343fb7ec7dd15e0176f4af5c1..b857e7ddc3ba0364aefb729d87d338b8cbb78a3e 100644
--- a/deps/libfabric/Makefile.am
+++ b/deps/libfabric/Makefile.am
@@ -2,6 +2,7 @@
 # Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
 # Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved.
 # Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved.
+# (C) Copyright 2020 Hewlett Packard Enterprise Development LP
 #
 # Makefile.am for libfabric
 
@@ -39,6 +40,10 @@ rdmainclude_HEADERS =
 
 # internal utility functions shared by in-tree providers:
 common_srcs =				\
+	src/hmem.c			\
+	src/hmem_rocr.c			\
+	src/hmem_cuda.c			\
+	src/hmem_ze.c			\
 	src/common.c			\
 	src/enosys.c			\
 	src/rbtree.c			\
@@ -68,6 +73,8 @@ common_srcs =				\
 	prov/util/src/util_mem_monitor.c\
 	prov/util/src/util_mem_hooks.c	\
 	prov/util/src/util_mr_cache.c	\
+	prov/util/src/cuda_mem_monitor.c \
+	prov/util/src/rocr_mem_monitor.c \
 	prov/util/src/util_coll.c
 
 
@@ -118,6 +125,7 @@ util_fi_pingpong_LDADD = $(linkback)
 
 nodist_src_libfabric_la_SOURCES =
 src_libfabric_la_SOURCES =			\
+	include/ofi_hmem.h			\
 	include/ofi.h				\
 	include/ofi_abi.h			\
 	include/ofi_atom.h			\
@@ -175,7 +183,7 @@ src_libfabric_la_LIBADD =
 src_libfabric_la_DEPENDENCIES = libfabric.map
 
 if !EMBEDDED
-src_libfabric_la_LDFLAGS += -version-info 14:1:13
+src_libfabric_la_LDFLAGS += -version-info 15:1:14
 endif
 src_libfabric_la_LDFLAGS += -export-dynamic \
 			   $(libfabric_version_script)
diff --git a/deps/libfabric/NEWS.md b/deps/libfabric/NEWS.md
index 666d1e072df488db14757e83d779f6571d2c3be7..27d4efa4f361ec5b6bb543f55f9d00b2479d4077 100644
--- a/deps/libfabric/NEWS.md
+++ b/deps/libfabric/NEWS.md
@@ -5,6 +5,233 @@ This file contains the main features as well as overviews of specific
 bug fixes (and other actions) for each version of Libfabric since
 version 1.0.
 
+v1.11.1, Fri Oct 9, 2020
+========================
+
+## Core
+
+- Remove calls to cuInit to prevent indirect call to fork
+- Ignore case when comparing provider names
+- Prevent layering util providers over EFA
+- Fix segfault if passed a NULL address to print
+- Fail build if CUDA is requested but not available
+
+## EFA
+
+- Switch to memhooks monitor
+- Avoid potential deadlock copying data to GPU buffers
+- Allow creating packet pools with non-huge pages
+- Check return value when processing data packets
+- Minor code restructuring and bug fixes
+- Check if outstanding TX limit has been reached prior to sending
+- Move RDMA read registration to post time
+- Do not overwrite a packet's associated MR when copying packets
+- Pass in correct packet when determining the header size
+- Do not release rx_entry in EAGAIN case
+- Disable MR cache if fork support is requested
+- Turn off MR cache if user supports FI_MR_LOCAL
+- Add FI_REMOTE_READ to shm registrations
+- Remove use_cnt assert closing domain to allow driver cleanup
+- Fix off-by-one error in returned AV address when using AV map
+- Ensure setting FI_HMEM capability is backwards compatible
+
+## RxD
+
+- Fix bug that prevents sending timely ACKs for segmented messages
+- Remove calls that recursively try to acquire the EP lock
+
+## RxM
+
+- Allow re-connecting to peers
+
+## SHM
+
+- Create duplicate fi_info's when reporting FI_HMEM support
+- Handle transfers larger than 2GB
+- Register for signal using SA_ONSTACK
+- Fix segfault if peer has not been inserted into local AV
+- Fix command/buffer tracking for sending connection requests
+- Return proper errno on AV lookup failures
+- Remove duplicate call to ofi_hmem_init
+- Fix using incorrect peer id for mid-sized message transfers
+- Fix addressing race conditions
+- Fix mixing of shm AV index values with fi_addr_t values
+- Fix initialization synchronization
+- Ensure progress is invoked for mid-sized message transfers
+- Always use CMA when sending data to self
+- Fix hang using SAR protocol
+
+## Sockets
+
+- Retry address lookup for messages received during CM setup
+
+## TCP
+
+- Fix possible deadlock during EP shutdown due to lock inversion
+- Rework CM state machine to fix lock inversion when handling disconnect
+
+## Util
+
+- Correctly mark if addresses support local/remote communication
+- Check madvise memhook advice
+- Update mmap intercept hook function
+- Replace memhooks implementation to intercept syscalls
+- Fix shmat intercept hook handling
+- Fix error handling obtaining page sizes
+- Fix incorrect locking in MR cache
+- Fix memory leak in rbtree cleanup
+
+## Verbs
+
+- Fix XRC transport shared INI QP locking
+- Account for off-by-one flow control credit issue
+- Fix disabling of receive queue flow control
+- Reduce overall memory footprint on fully connected apps
+- Skip reporting native IB addresses when network interface is requested
+
+v1.11.0, Fri Aug 14, 2020
+=========================
+
+## Core
+
+- Add generalized hmem_ops interface for device ops
+- Add FI_HMEM_CUDA, FI_HMEM_ROCR, and FI_HMEM_ZE interfaces and device support
+- Add CUDA and ROCR memory monitors and support for multiple monitors
+- Add fi_tostr for FI_HMEM_* interfaces
+- Add utility interface and device support
+- Add documentation for hmem override ops
+- Save mr_map mem_desc as ofi_mr
+- Rework and reorganize memory monitor code
+- Add mr_cache argument flush_lru to ofi_mr_cache_flush
+- Fix 1.1 ABI domain, EP, and tx attributes
+- Add loading of DL providers by name
+- Add CMA wrappers and define CMA for OSX
+- Fix util getinfo: use base fi_info caps, alter mr_mode properly,
+  support FI_MR_HMEM, handle NULL hints, set CQ FI_MSG flag, query FI_COLLECTIVE,
+  list FI_MATCH_COMPLETE, select and request a specific core provider
+- Add rbmap interface to get root node
+- Add support of AF_IB to addr manipulation functions
+- Windows: Map strtok_r() to strtok_s()
+- Define OFI_IB_IP_{PORT,PS}_MASK
+- Make fi_addr_format() public
+- Remove mr_cache entry subscribed field
+- Update memhooks brk and implement sbrk intercepts
+- Fix vrb_speed units
+- Fix possible null dereference in ofi_create_filter
+- Add ofi_idx_ordered_remove
+- Add functions ofi_generate_seed() and ofi_xorshift_random_r()
+- Call correct close fd call in util_wait_fd_close
+- Set a libfabric default universe size
+- Add compatibility with SUSE packaging
+- Windows: Handle socket API size limitations
+- Fix UBSAN warnings
+- Save and restore the errno in FI_LOG
+- Ensure that access to atomic handlers are in range
+- Ensure ifa_name is null terminated in ofi_get_list_of_addr
+- Buffer pools fallback to normal allocations when hugepage allocations fail
+
+## EFA
+
+- Add support to use user posted receive buffers with RDM EP when requested
+- Various fixes to FI_HMEM support
+- Added fork handler and abort if rdma-core is incorrectly configured
+- Fix bandwidth regression due to increased structure size
+- Reuse verbs protection domain when in same process address space
+- Periodically flush MR cache to reduce MR usage
+- Properly handle setting/unsetting RDMAV_HUGEPAGES_SAFE
+- Fix provider_version reported by EFA
+- Populate additional fields in fid_nic
+- Fix various bugs in the completion, info, and domain paths
+- Fix various memory leaks
+
+## PSM2
+
+- Treat dynamic connection errors as fatal
+- Add missing return status checking for PSM2 AM calls
+
+## RxD
+
+- Updated AV design to be dynamically extensible using indexer and index map
+- Replaced static allocation of peers with runtime allocation during RTS
+- Added wrapper to fetch pointer to a peer from the peers data structure
+- Updated to show correct msg_ordering
+- Check datatype size when handling atomic ops
+- Verify atomic opcode is in range to fix Klocwork issue
+- Corrected use of addr in rxd_atomic_inject for retrieving rxd_addr
+
+## RxM
+
+- Align reporting of FI_COLLECTIVE with man pages
+- Show correct ordering of atomic operations
+- Fix error handling inserting IP addresses into an AV
+- Minor code cleanups and bug fixes
+- Select different optimizations based on running over tcp vs verbs
+- Use SRX by default when using tcp to improve scaling
+- Correct CQ size calculation when using SRX
+- Fix MR registration error path when handling iov's
+- Allow selecting tcp wait objects separate from verbs
+- Only repost Rx buffers if necessary
+
+## SHM
+
+- Fix a CMA check bug
+- Fix shm provider signal handler calling the original handler
+- Add initial framework for IPC device copies
+- Add FI_HMEM support and integrate hmem_ops
+- Fix error handling path in smr_create
+- Fix AV insertion error handling
+- Verify atomic op value
+- Redefine shm addrlen to not use NAME_MAX
+- Fix snprintf to exclude byte for null terminator
+- Mark smr_region as volatile
+- Fix memory leaks
+
+## Sockets
+
+- Fix backwards compatibility accessing struct fi_mr_attr
+- Fix use after free error in CM threads
+- Free unclaimed messages during endpoint cleanup to avoid memory leaks
+- Improve handling of socket disconnection
+- Limit time spent in progress when expected list is long
+- Avoid thread starvation by converting spinlocks to mutex
+
+## TCP
+
+- Minor bug fixes
+- Verify received opcode values are valid
+- Avoid possible receive buffer overflow from malformed packets
+- Fix fi_cq_sread failing with ECANCELED
+- Optimize receive progress handling
+- Do not alter pseudo random sequence numbers
+- Increase default listen backlog size to improve scaling
+- Handle processing of NACK packets during connection setup
+- Fix wrong error handling during passive endpoint creation
+- Add logging messages during shutdown handling
+- Improve logging and error handling
+- Fix possible use after free issues during CM setup
+- Minor code restructuring
+
+## Util
+
+- Use internal flags in place of epoll flags for portability
+- Support HMEM with the mr_cache
+- Verify application requested FI_HMEM prior to accessing fi_mr_attr fields
+- Fix memory leak when using POLLFD wait sets
+- Ensure AV data is aligned even if address length is not
+- Fix handling of mr mode bits for API < 1.5
+- Allow user to force use of userfaultfd memory monitor
+
+## Verbs
+
+- Add support for AF_IB and native IB addressing
+- Minor code cleanups
+- Avoid possible string overrun parsing interface names
+- Fix memory leak when handling duplicate interface names
+- Add XRC shared Rx CQ credit reservation
+- Fix possible segfault when closing an XRC SRQ
+- Fix verbs speed units to MBps
+- Add flow control support to avoid RQ overruns
+- Fix memory leak of address data when creating endpoints
 
 v1.10.1, Fri May 8, 2020
 ========================
@@ -36,7 +263,6 @@ v1.10.1, Fri May 8, 2020
 
 - Add locking around signaling a wait fd
 
-
 v1.10.0, Fri Apr 24, 2020
 =========================
 
@@ -79,6 +305,10 @@ v1.10.0, Fri Apr 24, 2020
 - Fixes shm error handling paths
 - Fixes compiler warnings
 
+## PSM2
+
+- Improve source address translation for scalable endpoints
+
 ## RxM
 
 - Add support for pollfd wait objects
diff --git a/deps/libfabric/configure.ac b/deps/libfabric/configure.ac
index 179ad33d34287e6177644e5acdb4ac65d42402e1..04ae0696f916209d8b4ffdac6028596328100914 100644
--- a/deps/libfabric/configure.ac
+++ b/deps/libfabric/configure.ac
@@ -2,11 +2,12 @@ dnl
 dnl Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
 dnl Copyright (c) 2019 Intel, Inc.  All rights reserved.
 dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+dnl (C) Copyright 2020 Hewlett Packard Enterprise Development LP
 dnl
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.60])
-AC_INIT([libfabric], [1.10.1], [ofiwg@lists.openfabrics.org])
+AC_INIT([libfabric], [1.11.1], [ofiwg@lists.openfabrics.org])
 AC_CONFIG_SRCDIR([src/fabric.c])
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR(config)
@@ -362,8 +363,8 @@ AS_IF([test "$icc_symver_hack"],
 	[AC_MSG_RESULT(disabled)],
 [
 
-AC_TRY_LINK([],
-	[__asm__(".symver main_, main@ABIVER_1.0");],
+AC_TRY_LINK([__asm__(".symver main_, main@ABIVER_1.0");],
+	[],
 	[
 		AC_MSG_RESULT(yes)
 		ac_asm_symver_support=1
@@ -485,19 +486,139 @@ AC_ARG_WITH([cuda],
 			    and runtime libraries are installed.])],
 	    [], [])
 
-FI_CHECK_PACKAGE([cuda],
-		 [cuda_runtime.h],
-		 [cudart],
-		 [cudaMemcpy],
-		 [-lcuda],
-		 [$with_cuda],
-		 [],
-		 [AC_DEFINE([HAVE_LIBCUDA], [1],[CUDA support])],
-		 [], [])
+have_libcuda=0
+AS_IF([test x"$with_cuda" != x"no"],
+	    [FI_CHECK_PACKAGE([cuda],
+			      [cuda_runtime.h],
+			      [cudart],
+			      [cudaMemcpy],
+			      [-lcuda],
+			      [$with_cuda],
+			      [],
+			      [have_libcuda=1],
+			      [],
+			      [])],
+	    [])
+
+AS_IF([test "$with_cuda" = "yes" && test "$have_libcuda" = "0" ],
+	[AC_MSG_ERROR([CUDA support requested but CUDA runtime not available.])],
+	[])
+AC_DEFINE_UNQUOTED([HAVE_LIBCUDA], [$have_libcuda], [Whether we have CUDA runtime or not])
+
+AC_ARG_ENABLE([cuda-dlopen],
+    [AS_HELP_STRING([--enable-cuda-dlopen],
+        [Enable dlopen of CUDA libraries @<:@default=no@:>@])
+    ],
+    [
+        AS_IF([test "$freebsd" == "0"], [
+            AC_CHECK_LIB(dl, dlopen, [],
+                [AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.])])
+        ])
+        AC_DEFINE([ENABLE_CUDA_DLOPEN], [1], [dlopen CUDA libraries])
+    ],
+    [enable_cuda_dlopen=no])
+
+AC_ARG_WITH([ze],
+	AC_HELP_STRING([--with-ze=DIR], [Provide path to where the ZE
+					 libraries and headers are installed.]),
+	[], [])
+
+AS_IF([test x"$with_ze" != x"no"],
+      [FI_CHECK_PACKAGE([ze],
+			[level_zero/ze_api.h],
+			[ze_loader],
+			[zeInit],
+			[],
+			[$with_ze],
+			[],
+			[AC_DEFINE([HAVE_LIBZE], [1],[ZE support])],
+			[], [])
+       CPPFLAGS="$CPPFLAGS $ze_CPPFLAGS"
+       LDFLAGS="$LDFLAGS $ze_LDFLAGS"
+       LIBS="$LIBS $ze_LIBS"],
+      [])
+
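+dnl Memory monitor configure switches. The memhooks and uffd monitors can each
+dnl be disabled here; the AH_BOTTOM block below combines these switches with
+dnl platform checks to derive the HAVE_*_MONITOR macros.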
+enable_memhooks=1
+AC_ARG_ENABLE([memhooks-monitor],
+              [AC_HELP_STRING([--disable-memhooks-monitor],
+                              [Disable the memhooks memory monitor.])],
+              [enable_memhooks=0],
+              [])
+
+AC_DEFINE_UNQUOTED(ENABLE_MEMHOOKS_MONITOR, [$enable_memhooks],
+	[Define to 1 to enable memhooks memory monitor])
+
+AS_IF([test "$enable_memhooks" == "1"], [
+	AC_CHECK_FUNCS([__curbrk __clear_cache])
+	AC_CHECK_HEADERS([linux/mman.h sys/syscall.h])
+	AC_CHECK_DECLS([__syscall], [], [], [#include <sys/syscall.h>])
+	AC_CHECK_FUNCS([__syscall])
+	], [])
+
+enable_uffd=1
+AC_ARG_ENABLE([uffd-monitor],
+              [AC_HELP_STRING([--disable-uffd-monitor],
+                              [Disable the uffd memory monitor.])],
+              [enable_uffd=0],
+              [])
+
+AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd],
+	[Define to 1 to enable uffd memory monitor])
+
+
+AH_BOTTOM([
+#if defined(__linux__) && (defined(__x86_64__) || defined(__amd64__) || defined(__aarch64__)) && ENABLE_MEMHOOKS_MONITOR
+#define HAVE_MEMHOOKS_MONITOR 1
+#else
+#define HAVE_MEMHOOKS_MONITOR 0
+#endif
+
+#if HAVE_UFFD_UNMAP && ENABLE_UFFD_MONITOR
+#define HAVE_UFFD_MONITOR 1
+#else
+#define HAVE_UFFD_MONITOR 0
+#endif
+])
 
 CPPFLAGS="$CPPFLAGS $cuda_CPPFLAGS"
 LDFLAGS="$LDFLAGS $cuda_LDFLAGS"
-LIBS="$LIBS $cuda_LIBS"
+
+AS_IF([test x"$enable_cuda_dlopen" != x"yes"], [LIBS="$LIBS $cuda_LIBS"])
+
+dnl Check for ROCR runtime libraries.
+AC_ARG_WITH([rocr],
+	    [AC_HELP_STRING([--with-rocr=DIR],
+			    [Provide path to where the ROCR/HSA development
+			    and runtime libraries are installed.])],
+	    [], [])
+
+AC_ARG_ENABLE([rocr-dlopen],
+    [AS_HELP_STRING([--enable-rocr-dlopen],
+        [Enable dlopen of ROCR libraries @<:@default=no@:>@])
+    ],
+    [
+        AS_IF([test "$freebsd" == "0"], [
+            AC_CHECK_LIB(dl, dlopen, [],
+                [AC_MSG_ERROR([dlopen not found.  libfabric requires libdl.])])
+        ])
+        AC_DEFINE([ENABLE_ROCR_DLOPEN], [1], [dlopen ROCR libraries])
+    ],
+    [enable_rocr_dlopen=no])
+
+FI_CHECK_PACKAGE([rocr],
+		 [hsa/hsa_ext_amd.h],
+		 [hsa-runtime64],
+		 [hsa_amd_pointer_info],
+		 [],
+		 [$with_rocr],
+		 [$with_rocr/lib],
+		 [AC_DEFINE([HAVE_ROCR], [1], [ROCR HSA support])],
+		 [], [])
+
+CPPFLAGS="$CPPFLAGS $rocr_CPPFLAGS"
+LDFLAGS="$LDFLAGS $rocr_LDFLAGS"
+
+AS_IF([test x"$enable_rocr_dlopen" != x"yes"], [LIBS="$LIBS $rocr_LIBS"])
 
 dnl Provider-specific checks
 FI_PROVIDER_INIT
diff --git a/deps/libfabric/contrib/cray/Jenkinsfile.verbs b/deps/libfabric/contrib/cray/Jenkinsfile.verbs
index 3e0b1c793f6c0eff4c9d2c8d6211fcf324dbbdf0..22ad4246a095ba3017810e6268a838dac853a043 100644
--- a/deps/libfabric/contrib/cray/Jenkinsfile.verbs
+++ b/deps/libfabric/contrib/cray/Jenkinsfile.verbs
@@ -32,6 +32,9 @@ pipeline {
                 script {
                     GIT_DESCRIPTION = sh(returnStdout: true, script: "git describe --tags").trim()
                     LIBFABRIC_INSTALL = pwd tmp: true
+                    if (changeRequest()) {
+                        SFT_PR_ENV_VAR = 'SFT_PR=1'
+                    }
                 }
 
                 dir ('contrib/cray/bin') {
@@ -43,13 +46,17 @@ pipeline {
                 }
             }
         }
-        stage('Build') {
+        stage('Build CUDA and ROCR') {
             options {
                 timeout (time: 5, unit: 'MINUTES')
             }
+            environment {
+                LD_LIBRARY_PATH = "$ROCR_INSTALL_PATH/lib:$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH"
+            }
             steps {
                 sh './autogen.sh'
-                sh "./configure --prefix=$LIBFABRIC_INSTALL"
+                sh """./configure --prefix=$LIBFABRIC_INSTALL --disable-memhooks-monitor \
+                   --with-cuda=$CUDA_INSTALL_PATH --with-rocr=$ROCR_INSTALL_PATH"""
                 sh "make -j 12"
                 sh "make install"
                 dir ("fabtests") {
@@ -60,12 +67,120 @@ pipeline {
                 }
             }
         }
+        stage("Verify CUDA and ROCR Build") {
+            steps {
+                script {
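+                    // Count DT_NEEDED entries referencing CUDA in libfabric.so;
+                    // a directly linked build is expected to need both libcuda
+                    // and libcudart (count of 2), and one hsa entry for ROCR.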
+                    cuda_link_count = sh(returnStdout: true,
+                                         script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so |
+                                                 grep NEED | grep cuda | wc -l""").trim()
+                    if (cuda_link_count != "2") {
+                        error("libfabric failed to link to CUDA")
+                    }
+                    rocr_link_count = sh(returnStdout: true,
+                                         script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so |
+                                                 grep NEED | grep hsa | wc -l""").trim()
+                    if (rocr_link_count != "1") {
+                        error("libfabric failed to link to ROCR")
+                    }
+                }
+            }
+        }
+        stage('Build CUDA and ROCR dlopen') {
+            options {
+                timeout (time: 5, unit: 'MINUTES')
+            }
+            steps {
+                sh './autogen.sh'
+                sh """./configure --prefix=$LIBFABRIC_INSTALL --disable-memhooks-monitor \
+                   --with-cuda=$CUDA_INSTALL_PATH --enable-cuda-dlopen \
+                   --with-rocr=$ROCR_INSTALL_PATH --enable-rocr-dlopen"""
+                sh "make -j 12"
+                sh "make install"
+                dir ("fabtests") {
+                    sh './autogen.sh'
+                    sh "./configure --with-libfabric=$LIBFABRIC_INSTALL --prefix=$FABTEST_PATH"
+                    sh "make -j12"
+                    sh "make -j12 install"
+                }
+            }
+        }
+        stage("Verify CUDA and ROCR Build dlopen") {
+            steps {
+                script {
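+                    // With --enable-cuda-dlopen/--enable-rocr-dlopen the GPU
+                    // runtimes are loaded at run time, so libfabric.so should
+                    // carry no CUDA or hsa DT_NEEDED entries.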
+                    cuda_link_count = sh(returnStdout: true,
+                                         script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so |
+                                                 grep NEED | grep cuda | wc -l""").trim()
+                    if (cuda_link_count != "0") {
+                        error("libfabric should not link directly to CUDA when dlopen is enabled")
+                    }
+                    rocr_link_count = sh(returnStdout: true,
+                                         script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so |
+                                                 grep NEED | grep hsa | wc -l""").trim()
+                    if (rocr_link_count != "0") {
+                        error("libfabric should not link directly to ROCR when dlopen is enabled")
+                    }
+                }
+            }
+        }
+        stage('Build LTU') {
+            options {
+                timeout (time: 5, unit: 'MINUTES')
+            }
+            steps {
+                dir ('libfabric-test-utils') {
+                    deleteDir ()
+                }
+                dir ('libfabric-test-utils') {
+                    git url: "ssh://${env.LTU_GIT_REPO}", credentialsId: 'jenkins-nw-cje2-sshkey', branch: "${env.SRC_GIT_BRANCH}"
+                    sh "git remote -v"
+                    script {
+                        LTU_VERSION = sh(returnStdout: true, script: "cat .version").trim()
+                        GIT_SHORT_COMMIT = sh(returnStdout: true, script: "git log -n 1 --pretty=format:'%h'").trim()
+                        LTU_VERSION = "${LTU_VERSION}" + "_${GIT_SHORT_COMMIT}"
+                    }
+                    echo "*** Building libfabric-test-utils, Version: ${LTU_VERSION} ***"
+                    sh "./autogen.sh"
+                    sh """./configure --prefix=${LIBFABRIC_INSTALL} --with-libfabric=${LIBFABRIC_INSTALL} \
+                                      --with-nvidia=${CUDA_INSTALL_PATH} --with-amd=${ROCR_INSTALL_PATH} \
+                                      --with-pmi=${PMI_INSTALL_PATH} --with-pmi_include=${PMI_INCLUDE_PATH} \
+                                      --with-ltu-build-string=\"libfabric-test-utils-${LTU_VERSION}\""""
+                    sh "make -j 10"
+                    sh "make install"
+                }
+            }
+        }
+        stage('Build SFT') {
+            options {
+                timeout (time: 5, unit: 'MINUTES')
+            }
+            steps {
+                dir ('libfabric-sft') {
+                    deleteDir ()
+                }
+                dir ('libfabric-sft') {
+                    git url: "ssh://${env.SFT_GIT_REPO}", credentialsId: 'jenkins-nw-cje2-sshkey', branch: "${env.SRC_GIT_BRANCH}"
+                    sh "git remote -v"
+                    script {
+                        SFT_VERSION = sh(returnStdout: true, script: "cat .version").trim()
+                        GIT_SHORT_COMMIT = sh(returnStdout: true, script: "git log -n 1 --pretty=format:'%h'").trim()
+                        SFT_VERSION = "${SFT_VERSION}" + "_${GIT_SHORT_COMMIT}"
+                    }
+                    echo "*** Building libfabric-sft, Version: ${SFT_VERSION} ***"
+                    sh "./autogen.sh"
+                    sh """./configure --prefix=${LIBFABRIC_INSTALL} --with-libfabric=${LIBFABRIC_INSTALL} \
+                                      --with-libltu=${LIBFABRIC_INSTALL} \
+                                      --with-sft-build-string=\"libfabric-sft-${LTU_VERSION}\""""
+                    sh "make -j 10"
+                    sh "make install"
+                }
+            }
+        }
         stage('Test') {
             environment {
                 LD_LIBRARY_PATH = "$LIBFABRIC_INSTALL/lib:$LD_LIBRARY_PATH"
                 MPIR_CVAR_OFI_USE_PROVIDER = 'verbs;ofi_rxm'
                 LIBFABRIC_INSTALL_PATH = "$LIBFABRIC_INSTALL"
-                SFT_BIN = "${SFT_INSTALL_PATH + '/bin'}"
+                SFT_BIN = "${LIBFABRIC_INSTALL + '/bin'}"
                 SFT_MAX_JOB_TIME = '3'
                 SFT_NUM_JOBS = '6'
                 SFT_PROVIDER = 'verbs;ofi_rxm'
@@ -77,6 +192,7 @@ pipeline {
                 SFT_TEST_RESULTS_EXPECTED = 'expected_'
                 SFT_TEST_RESULTS_PREFIX = 'BUILD_'
                 SFT_TEST_RESULTS_CI = 'sft_ci_results.yaml'
+                FI_VERBS_MIN_RNR_TIMER = '4'
             }
             options {
                 timeout (time: 22, unit: 'MINUTES')
@@ -170,7 +286,7 @@ pipeline {
                                 try {
                                     dir ("${SFT_BIN}") {
                                         sh """
-                                            ./ci-all.sh \\
+                                            ${SFT_PR_ENV_VAR} ./ci-all.sh \\
                                                 --provider '${SFT_PROVIDER}' \\
                                                 -L ${SFT_TEST_RESULTS_DIR} \\
                                                 --num-jobs ${SFT_NUM_JOBS} \\
@@ -217,7 +333,7 @@ pipeline {
                                 try {
                                     dir ("${SFT_BIN}") {
                                         sh """
-                                            ./ci-all.sh \\
+                                            ${SFT_PR_ENV_VAR} ./ci-all.sh \\
                                                 --provider '${SFT_PROVIDER}' \\
                                                 -L ${SFT_TEST_RESULTS_DIR} \\
                                                 --num-jobs ${SFT_NUM_JOBS} \\
@@ -257,17 +373,27 @@ pipeline {
                 }
                 stage("Applications") {
                     steps {
-                        tee ('mpi.tap') {
+                        tee ('omb.tap') {
                             timeout(time: 10, unit: 'MINUTES') {
-                                sh '${BATS_INSTALL_PATH}/bats -t contrib/cray/bats/mpi.bats'
+                                sh '${BATS_INSTALL_PATH}/bats -t contrib/cray/bats/omb.bats'
                             }
                         }
+                        tee ('imb.tap') {
+                            timeout(time: 20, unit: 'MINUTES') {
+                                sh '${BATS_INSTALL_PATH}/bats -t contrib/cray/bats/imb.bats'
+                            }
+                        }
                     }
                     post {
                         always {
                             sh """contrib/cray/bin/parse_logfiles.sh \\
-                                    -r mpi.tap \\
-                                    -w mpi.xml \\
+                                    -r omb.tap \\
+                                    -w omb.xml \\
+                                    tap applications.mpi applications"""
+
+                            sh """contrib/cray/bin/parse_logfiles.sh \\
+                                    -r imb.tap \\
+                                    -w imb.xml \\
                                     tap applications.mpi applications"""
                         }
                     }
@@ -298,10 +424,15 @@ pipeline {
                     step ([$class: 'XUnitPublisher',
                        thresholds: [
                             [$class: 'FailedThreshold', unstableThreshold: '0']],
-                            tools: [[$class: 'JUnitType', pattern: "mpi.xml"]]])
+                            tools: [[$class: 'JUnitType', pattern: "omb.xml"]]])
+                    step ([$class: 'XUnitPublisher',
+                       thresholds: [
+                            [$class: 'FailedThreshold', unstableThreshold: '0']],
+                            tools: [[$class: 'JUnitType', pattern: "imb.xml"]]])
                 }
                 cleanup {
-                    echo "*** Test: Post: Cleanup: env.BRANCH_NAME: ${BRANCH_NAME} ***"
+                    echo "*** Test: Post: Cleanup: env.BRANCH_NAME: ${env.BRANCH_NAME} ***"
+                    echo "*** Test: Post: Cleanup: isOfiwgBuild: " + isOfiwgBuild() + " ***"
                     script {
                         if ( isInternalBuild() ) {
                             echo "*** Test: Post: Cleanup: isInternalBuild: TRUE ***"
@@ -317,32 +448,35 @@ pipeline {
             when {
                 allOf {
                     expression { currentBuild.currentResult == 'SUCCESS' } ;
-                    expression { isInternalBuild() } ;
-                    anyOf {
-                        expression { env.BRANCH_NAME == 'master' } ;
-                        buildingTag() ;
-                    }
                 }
             }
             environment {
                 LIBFABRIC_INSTALL_PATH="${LIBFABRIC_BUILD_PATH + '/' + GIT_DESCRIPTION}"
             }
             steps {
-                sh './autogen.sh'
-                sh "./configure --prefix=$LIBFABRIC_INSTALL_PATH"
-                sh "make -j 12"
-                sh "make install"
+                script {
+                    BUILD_LIBFABRIC = 'false'
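+                    // Install only for internal master/tag builds, or for
+                    // upstream (ofiwg) master builds, which install under an
+                    // OFIWG_-prefixed directory.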
+                    if ( isInternalBuild() &&
+                        (( env.BRANCH_NAME == 'master' ) || buildingTag())) {
+                        BUILD_LIBFABRIC = 'true'
+                    } else if ( isOfiwgBuild() && ( env.BRANCH_NAME == 'master' )) {
+                        LIBFABRIC_INSTALL_PATH="${LIBFABRIC_BUILD_PATH + '/' + 'OFIWG_' + GIT_DESCRIPTION}"
+                        BUILD_LIBFABRIC = 'true'
+                    }
+                    echo "*** Install Libfabric Build: BUILD_LIBFABRIC: $BUILD_LIBFABRIC ***"
+                    if ( BUILD_LIBFABRIC == 'true' ) {
+                        sh "./autogen.sh"
+                        sh "./configure --prefix=$LIBFABRIC_INSTALL_PATH --disable-memhooks-monitor"
+                        sh "make -j 12"
+                        sh "make install"
+                    }
+                }
             }
         }
         stage("Deploy") {
             when {
                 allOf {
                     expression { currentBuild.currentResult == 'SUCCESS' } ;
-                    expression { isInternalBuild() } ;
-                    anyOf {
-                        expression { env.BRANCH_NAME == 'master' } ;
-                        buildingTag()
-                    }
                 }
             }
             options {
@@ -355,7 +489,10 @@ pipeline {
             parallel {
                 stage("Create nightly link") {
                     when {
-                        expression { env.BRANCH_NAME == 'master' }
+                        allOf {
+                            expression { isInternalBuild() } ;
+                            expression { env.BRANCH_NAME == 'master' }
+                        }
                     }
                     steps {
                         dir (env.TAG_DIRECTORY) {
@@ -366,7 +503,10 @@ pipeline {
                 }
                 stage("Create tagged link") {
                     when {
-                        buildingTag()
+                        allOf {
+                            expression { isInternalBuild() } ;
+                            buildingTag()
+                        }
                     }
                     steps {
                         dir (env.TAG_DIRECTORY) {
@@ -375,6 +515,20 @@ pipeline {
                         }
                     }
                 }
+                stage("Create upstream link") {
+                    when {
+                        allOf {
+                            expression { isOfiwgBuild() } ;
+                            expression { env.BRANCH_NAME == 'master' }
+                        }
+                    }
+                    steps {
+                        dir (env.TAG_DIRECTORY) {
+                            sh "rm -f upstream || true"
+                            sh "ln -s ../OFIWG_$GIT_DESCRIPTION upstream"
+                        }
+                    }
+                }
             }
         }
     }
@@ -408,9 +562,20 @@ pipeline {
         ROOT_BUILD_PATH = "/scratch/jenkins/builds"
         FABTEST_PATH = "${WORKSPACE + '/installs/fabtests'}"
         LIBFABRIC_BUILD_PATH = "${ROOT_BUILD_PATH + '/libfabric'}"
-        OMB_BUILD_PATH = "${ROOT_BUILD_PATH + '/osu-micro-benchmarks/5.4.2/libexec/osu-micro-benchmarks/mpi'}"
-        MPICH_PATH = "${ROOT_BUILD_PATH + '/mpich/3.3'}"
+        OMB_BUILD_PATH = "${ROOT_BUILD_PATH + '/osu-micro-benchmarks/stable/libexec/osu-micro-benchmarks/mpi'}"
+        IMB_BUILD_PATH = "${ROOT_BUILD_PATH + '/imb/v2019.6'}"
+        MPICH_PATH = "${ROOT_BUILD_PATH + '/mpich/stable'}"
         SFT_INSTALL_PATH = "${ROOT_BUILD_PATH + '/libfabric-sft/stable'}"
+        SFT_PR_ENV_VAR = 'SFT_PR=0'
         BATS_INSTALL_PATH = "${ROOT_BUILD_PATH + '/bats/stable/bin'}"
+        CUDA_INSTALL_PATH = "/scratch/opt/cuda"
+        ROCR_INSTALL_PATH = "/opt/rocm"
+        PMI_INCLUDE_PATH = "/usr/include/slurm"
+        PMI_INSTALL_PATH = "/usr/lib64"
+        LTU_VERSION = "0.0.0"
+        SFT_VERSION = "0.0.0"
+        LTU_GIT_REPO = 'git@stash.us.cray.com:7999/ofi-cray/libfabric-test-utils.git'
+        SFT_GIT_REPO = 'git@stash.us.cray.com:7999/ofi-cray/libfabric-sft.git'
+        SRC_GIT_BRANCH = 'master'
     }
 }
diff --git a/deps/libfabric/contrib/cray/bats/batsgenerator.sh b/deps/libfabric/contrib/cray/bats/batsgenerator.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cee576721fc60ef3dd73b7b9d12345cc7b9f38c2
--- /dev/null
+++ b/deps/libfabric/contrib/cray/bats/batsgenerator.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Generate a bats file to run Intel MPI Benchmarks
+# Assumes the IMB test suite has been installed and is referenced in Jenkinsfile.verbs
+# Example:
+# Add the IMB-EXT window test running 20 ranks, 5 ranks per node to imb.bats:
+#       ./batsgenerator.sh IMB-EXT window 20 5 imb.bats
+
+# Insert shebang and load test helper
+shebang="#!/usr/bin/env bats\n\n"
+fi_env="XRC_FI_ENV=\"FI_VERBS_XRCD_FILENAME=/tmp/xrc_imb_\$\$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1\"\n\n"
+
+# Command line input: test suite
+# E.g. IMB-EXT
+test_suite=$1
+shift
+
+# Command line input: benchmark
+# E.g. windows
+benchmark=$1
+shift
+
+# Command line input: number of ranks
+# E.g. 20
+num_ranks=$1
+shift
+
+# Command line input: number of ranks per node (rpn)
+# E.g. 5
+num_rpn=$1
+shift
+
+# Command line input: name of bats file
+# E.g. imb.bats
+bats_file=$1
+shift
+
+if [[ $# -gt 0 ]] ; then
+        iter_flag=" -iter $1"
+else
+        iter_flag=""
+fi
+
+if [ ! -f "${bats_file}" ]; then
+        printf "${shebang}load test_helper\n\n${fi_env}" >> ${bats_file}
+fi
+
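+# Substitute the test parameters into benchmark.template (both the RC and XRC
+# variants) and append the generated test cases to the bats file.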
+sed -e "s/@TEST_SUITE@/${test_suite}/g" \
+    -e "s/@BENCHMARK@/${benchmark}/g" \
+    -e "s/@RANKS@/${num_ranks}/g" \
+    -e "s/@RPN@/${num_rpn}/g" \
+    -e "s/@ITER_FLAG@/${iter_flag}/g" \
+    benchmark.template >> ${bats_file}
diff --git a/deps/libfabric/contrib/cray/bats/benchmark.template b/deps/libfabric/contrib/cray/bats/benchmark.template
new file mode 100644
index 0000000000000000000000000000000000000000..8544a0eeeb1d34bfcd41b451399a22780fac1e1f
--- /dev/null
+++ b/deps/libfabric/contrib/cray/bats/benchmark.template
@@ -0,0 +1,14 @@
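+# Template consumed by batsgenerator.sh: the @TEST_SUITE@, @BENCHMARK@, @RANKS@,
+# @RPN@, and @ITER_FLAG@ placeholders are filled in via sed when generating a
+# bats file.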
+# RC
+@test "@TEST_SUITE@ @BENCHMARK@ @RANKS@ ranks, @RPN@ ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher @RANKS@ @RPN@) timeout 300 "$IMB_BUILD_PATH/@TEST_SUITE@ -npmin @RANKS@@ITER_FLAG@ -time 10 -mem 2 -msglog 2:18 @BENCHMARK@"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "@TEST_SUITE@ @BENCHMARK@ @RANKS@ ranks, @RPN@ ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher @RANKS@ @RPN@) timeout 300 "$IMB_BUILD_PATH/@TEST_SUITE@ -npmin @RANKS@@ITER_FLAG@ -time 10 -mem 2 -msglog 2:18 @BENCHMARK@"
+        [ "$status" -eq 0 ]
+}
diff --git a/deps/libfabric/contrib/cray/bats/imb.bats b/deps/libfabric/contrib/cray/bats/imb.bats
new file mode 100644
index 0000000000000000000000000000000000000000..4b9d95626edceeb1955e1ffbb9877f7b406556a4
--- /dev/null
+++ b/deps/libfabric/contrib/cray/bats/imb.bats
@@ -0,0 +1,664 @@
+#!/usr/bin/env bats
+
+load test_helper
+
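+# Environment prefix for the XRC test variants: use a per-run XRC domain file,
+# enable RxM shared receive contexts, and prefer XRC over RC queue pairs.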
+XRC_FI_ENV="FI_VERBS_XRCD_FILENAME=/tmp/xrc_imb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1"
+
+# RC
+@test "IMB-P2P unirandom 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 unirandom"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-P2P unirandom 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 unirandom"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-P2P birandom 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 birandom"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-P2P birandom 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 birandom"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-P2P corandom 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 corandom"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-P2P corandom 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 corandom"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-RMA bidir_get 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_get"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-RMA bidir_get 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_get"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-RMA bidir_put 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_put"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-RMA bidir_put 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_put"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-RMA unidir_get 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_get"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-RMA unidir_get 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_get"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-RMA unidir_put 2 ranks, 1 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_put"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-RMA unidir_put 2 ranks, 1 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_put"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-EXT window 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 window"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-EXT window 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 window"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-EXT accumulate 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 accumulate"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-EXT accumulate 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 accumulate"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ialltoall 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ialltoall 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ialltoall_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ialltoall_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ialltoallv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ialltoallv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ialltoallv_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ialltoallv_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iallgather 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iallgather 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iallgather_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iallgather_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iallgatherv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iallgatherv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iallgatherv_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iallgatherv_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iallreduce 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iallreduce 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iallreduce_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iallreduce_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ibarrier 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ibarrier 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ibarrier_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ibarrier_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ibcast 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ibcast 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ibcast_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ibcast_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC igather 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC igather 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC igather_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC igather_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC igatherv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC igatherv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC igatherv_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC igatherv_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ireduce 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ireduce 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ireduce_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ireduce_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC ireduce_scatter 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_scatter"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC ireduce_scatter 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_scatter"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iscatter 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iscatter 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iscatter_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iscatter_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iscatterv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iscatterv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-NBC iscatterv_pure 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv_pure"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-NBC iscatterv_pure 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv_pure"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 reduce 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 reduce 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 reduce_scatter 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 reduce_scatter 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 reduce_scatter_block 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter_block"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 reduce_scatter_block 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter_block"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 allreduce 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allreduce"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 allreduce 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allreduce"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 allgather 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgather"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 allgather 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgather"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 allgatherv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgatherv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 allgatherv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgatherv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 scatter 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatter"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 scatter 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatter"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 scatterv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatterv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 scatterv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatterv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 gather 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gather"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 gather 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gather"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 gatherv 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gatherv"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 gatherv 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gatherv"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 alltoall 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 alltoall"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 alltoall 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 alltoall"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 bcast 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 bcast"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 bcast 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 bcast"
+        [ "$status" -eq 0 ]
+}
+# RC
+@test "IMB-MPI1 barrier 20 ranks, 5 ranks per node using RC verbs" {
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 barrier"
+        [ "$status" -eq 0 ]
+}
+
+# XRC
+@test "IMB-MPI1 barrier 20 ranks, 5 ranks per node using XRC verbs" {
+        eval ${XRC_FI_ENV} \
+        run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+                $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 barrier"
+        [ "$status" -eq 0 ]
+}
diff --git a/deps/libfabric/contrib/cray/bats/mpi.bats b/deps/libfabric/contrib/cray/bats/omb.bats
similarity index 75%
rename from deps/libfabric/contrib/cray/bats/mpi.bats
rename to deps/libfabric/contrib/cray/bats/omb.bats
index 48c96e143c4281cdabf714f7fa04e41b0ab2fb4f..37693d860bf6240acc7c2a33ad351a8c4d70d07c 100644
--- a/deps/libfabric/contrib/cray/bats/mpi.bats
+++ b/deps/libfabric/contrib/cray/bats/omb.bats
@@ -203,199 +203,199 @@ load test_helper
 
 # XRC
 @test "osu_latency 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/pt2pt/osu_latency
     [ "$status" -eq 0 ]
 }
 
 @test "osu_bw 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/pt2pt/osu_bw
     [ "$status" -eq 0 ]
 }
 
 @test "osu_mbw_mr 8 ranks, 4 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 8 4) timeout 300 $OMB_BUILD_PATH/pt2pt/osu_mbw_mr
     [ "$status" -eq 0 ]
 }
 
 @test "osu_get_latency 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_get_latency
     [ "$status" -eq 0 ]
 }
 
 @test "osu_get_bw 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_get_bw
     [ "$status" -eq 0 ]
 }
 
 @test "osu_put_latency 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_put_latency
     [ "$status" -eq 0 ]
 }
 
 @test "osu_put_bw 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_put_bw
     [ "$status" -eq 0 ]
 }
 
 @test "osu_put_bibw 2 ranks, 1 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_put_bibw
     [ "$status" -eq 0 ]
 }
 
 @test "osu_allreduce 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_allreduce
     [ "$status" -eq 0 ]
 }
 
 @test "osu_allgather 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_allgather
     [ "$status" -eq 0 ]
 }
 
 @test "osu_allgatherv 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_allgatherv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_alltoall 20 ranks, 5 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_alltoall
     [ "$status" -eq 0 ]
 }
 
 @test "osu_alltoallv 20 ranks, 5 ranks per node using XRC verbs" {
     skip "fails consistently at 128k message size"
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_alltoallv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_barrier 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_barrier
     [ "$status" -eq 0 ]
 }
 
 @test "osu_bcast 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_bcast
     [ "$status" -eq 0 ]
 }
 
 @test "osu_gather 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_gather
     [ "$status" -eq 0 ]
 }
 
 @test "osu_gatherv 40 ranks, 10 ranks per node using XRC verbs" {
     skip "fails intermittently"
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_gatherv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_iallgather 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iallgather
     [ "$status" -eq 0 ]
 }
 
 @test "osu_iallgatherv 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iallgatherv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_ialltoall 20 ranks, 5 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_ialltoall
     [ "$status" -eq 0 ]
 }
 
 @test "osu_ialltoallv 20 ranks, 5 ranks per node using XRC verbs" {
     skip "fails consistently at 128k message size"
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_ialltoallv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_ialltoallw 20 ranks, 5 ranks per node using XRC verbs" {
     skip "fails consistently at 128k message size"
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_ialltoallw
     [ "$status" -eq 0 ]
 }
 
 @test "osu_ibarrier 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_ibarrier
     [ "$status" -eq 0 ]
 }
 
 @test "osu_ibcast 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_ibcast
     [ "$status" -eq 0 ]
 }
 
 @test "osu_igather 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_igather
     [ "$status" -eq 0 ]
 }
 
 @test "osu_igatherv 40 ranks, 10 ranks per node using XRC verbs" {
     skip "fails intermittently"
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_igatherv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_iscatter 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iscatter
     [ "$status" -eq 0 ]
 }
 
 @test "osu_iscatterv 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iscatterv
     [ "$status" -eq 0 ]
 }
 
 @test "osu_reduce 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_reduce
     [ "$status" -eq 0 ]
 }
 
 @test "osu_reduce_scatter 40 ranks, 10 ranks per node using XRC verbs" {
     skip "fails consistently at 512K message size"
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_reduce_scatter
     [ "$status" -eq 0 ]
 }
 
 @test "osu_scatter 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_scatter
     [ "$status" -eq 0 ]
 }
 
 @test "osu_scatterv 40 ranks, 10 ranks per node using XRC verbs" {
-    FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
+    FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \
         $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_scatterv
     [ "$status" -eq 0 ]
 }
diff --git a/deps/libfabric/contrib/cray/bin/run_libfabric_pipeline b/deps/libfabric/contrib/cray/bin/run_libfabric_pipeline
index 6383aea9ca50d3689adfe2670196e0f59e1b6637..88978f4ee12c7c8bb76516c6e0bc8d2f9dcd959e 100755
--- a/deps/libfabric/contrib/cray/bin/run_libfabric_pipeline
+++ b/deps/libfabric/contrib/cray/bin/run_libfabric_pipeline
@@ -21,9 +21,10 @@ BUILD=true
 TEST=true
 UNITTEST=true
 SMOKETEST=true
+IMBTEST=true
+OMBTEST=true
 FABTEST=true
 SFT=true
-MPI=true
 
 function usage {
     echo \
@@ -38,15 +39,16 @@ function set_sections_to_run {
     TEST=false
     UNITTEST=false
     SMOKETEST=false
+    IMBTEST=false
+    OMBTEST=false
     FABTEST=false
     SFT=false
-    MPI=false
 
     sections=$(echo $@ | tr ',' ' ')
     for section in $sections ; do
         section_name=$(echo $section | awk '{print toupper($0)}')
         case $section_name in
-            'UNITTEST'|'SMOKETEST'|'FABTEST'|'SFT'|'MPI')
+            'UNITTEST'|'SMOKETEST'|'IMBTEST'|'OMBTEST'|'FABTEST'|'SFT')
                 TEST=true
                 eval ${section_name}=true
                 ;;
@@ -55,17 +57,19 @@ function set_sections_to_run {
                 TEST=true
                 UNITTEST=true
                 SMOKETEST=true
+                IMBTEST=true
+                OMBTEST=true
                 FABTEST=true
                 SFT=true
-                MPI=true
                 ;;
             'TEST')
                 TEST=true
                 UNITTEST=true
                 SMOKETEST=true
+                IMBTEST=true
+                OMBTEST=true
                 FABTEST=true
                 SFT=true
-                MPI=true
                 ;;
              'BUILD')
                 BUILD=true
@@ -75,7 +79,7 @@ function set_sections_to_run {
         esac
     done
 
-    for each in BUILD TEST UNITTEST SMOKETEST SFT MPI ; do
+    for each in BUILD TEST UNITTEST SMOKETEST IMBTEST OMBTEST SFT ; do
         if $DEBUG ; then echo ${each} = $(eval echo \$$each) ; fi
     done
 }
@@ -127,7 +131,7 @@ verbose "CLEAN:     $CLEAN"
 verbose "SECTIONS:  $SECTIONS"
 verbose "WORKSPACE: $WORKSPACE"
 
-for each in BUILD TEST UNITTEST SMOKETEST FABTEST MPI SFT ; do
+for each in BUILD TEST UNITTEST SMOKETEST IMBTEST OMBTEST FABTEST SFT ; do
     verbose "$each: $(eval echo \$$each)"
 done
 
@@ -168,10 +172,11 @@ export ROOT_BUILD_PATH="/scratch/jenkins/builds"
 export FABTEST_PATH="${WORKSPACE}/fabtests"
 export LIBFABRIC_BUILD_PATH="${ROOT_BUILD_PATH}/libfabric"
 export OMB_BUILD_PATH="${ROOT_BUILD_PATH}/osu-micro-benchmarks/5.4.2/libexec/osu-micro-benchmarks/mpi"
-export MPICH_PATH="${ROOT_BUILD_PATH}/mpich/3.3b3"
+export MPICH_PATH="${ROOT_BUILD_PATH}/mpich/stable"
 export SFT_INSTALL_PATH="${ROOT_BUILD_PATH}/libfabric-sft/stable"
 export BATS_INSTALL_PATH="${ROOT_BUILD_PATH}/bats/stable/bin"
 export BATS_LOG_DIRECTORY="$WORKSPACE/logs"
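+# Intel MPI Benchmarks install, used by the imb test section below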
+export IMB_BUILD_PATH="${ROOT_BUILD_PATH}/imb/v2019.6"
 # End pipeline variables
 
 # Start Prologue
@@ -228,6 +233,22 @@ $BATS_INSTALL_PATH/bats $@ -t contrib/cray/bats/smoketests.bats | tee smoketests
 section_end 'smoke tests'
 fi
 
+if $IMBTEST ; then
+section_start 'imb tests'
+## Start IMB Tests
+$BATS_INSTALL_PATH/bats $@ -t contrib/cray/bats/imb.bats | tee imb.tap
+## End IMB Tests
+section_end 'imb tests'
+fi
+
+if $OMBTEST ; then
+section_start 'omb tests'
+## Start OMB Tests
+$BATS_INSTALL_PATH/bats $@ -t contrib/cray/bats/omb.bats | tee omb.tap
+## End OMB Tests
+section_end 'omb tests'
+fi
+
 if $FABTEST ; then
 section_start 'fabtests'
 ## Start Fabtests
@@ -286,8 +307,8 @@ timeout 900 ./ci-all.sh \
     --results-file ${SFT_TEST_RESULTS_DIR}/${SFT_TEST_RESULTS_CI}
 popd
 
-cp  ./${SFT_BASELINE_DIR}/${SFT_BASELINE_RESULTS_FILE} ${SFT_TEST_RESULTS_DIR}/ \
-    ${SFT_TEST_RESULTS_EXPECTED}${SFT_BASELINE_RESULTS_FILE}
+cp  ./${SFT_BASELINE_DIR}/${SFT_BASELINE_RESULTS_FILE} \
+    ${SFT_TEST_RESULTS_DIR}/${SFT_TEST_RESULTS_EXPECTED}${SFT_BASELINE_RESULTS_FILE}
 ${SFT_BIN}/sft_parse_test_results.pm \
     -b ${SFT_TEST_RESULTS_EXPECTED}${SFT_BASELINE_RESULTS_FILE} \
     -d ${SFT_TEST_RESULTS_DIR} \
@@ -304,12 +325,4 @@ rm -rf ${SFT_TEST_RESULTS_DIR} || true
 section_end 'sft'
 fi
 
-if $MPI ; then
-section_start 'mpi'
-## Start MPI Tests
-$BATS_INSTALL_PATH/bats -t contrib/cray/bats/mpi.bats | tee mpi.tap
-## End MPI Tests
-section_end 'mpi'
-fi
-
 fi
diff --git a/deps/libfabric/contrib/intel/jenkins/Jenkinsfile b/deps/libfabric/contrib/intel/jenkins/Jenkinsfile
index 1253bfc730683888648842724d878ac0e9fb50d3..fe992a2f41865e99e0a56a4387e3274e2650abeb 100644
--- a/deps/libfabric/contrib/intel/jenkins/Jenkinsfile
+++ b/deps/libfabric/contrib/intel/jenkins/Jenkinsfile
@@ -44,48 +44,47 @@ pipeline {
         
         stage ('build-shmem') {
             steps {
-              withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) {
+              withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){
                 sh """
                 python3.7  contrib/intel/jenkins/build.py 'shmem' --ofi_build_mode='dbg'
                 echo 'shmem benchmarks built successfully'
                 """
-                }
               }
-          }
-  
-        stage ('build OMPI_bm') {
-              steps {
-              withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) {
-                  sh """
-                  python3.7 contrib/intel/jenkins/build.py 'ompi_benchmarks' --ofi_build_mode='dbg'
-                  echo 'mpi benchmarks with ompi - built successfully'
-                 """
-                }
+            }
+        }
+        stage('build MPICH_bm') {
+            steps {
+              withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){
+                sh """
+                python3.7 contrib/intel/jenkins/build.py 'mpich_benchmarks' --ofi_build_mode='dbg'
+                echo "mpi benchmarks with mpich - built successfully"
+                """
               }
-          }
-    
-    stage('build IMPI_bm') {
-        steps {
-          withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) {
+            }
+        }
+        stage('build IMPI_bm') {
+            steps {
+              withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){
                 sh """
                 python3.7 contrib/intel/jenkins/build.py 'impi_benchmarks' --ofi_build_mode='dbg'
                 echo 'mpi benchmarks with impi - built successfully'
                 """
+              }
             }
-          }
-      }  
-    
-    stage('build MPICH_bm') {
-        steps {
-          withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) {
+        }
+
+        stage ('build OMPI_bm') {
+            steps {
+              withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){
                 sh """
-                python3.7 contrib/intel/jenkins/build.py 'mpich_benchmarks' --ofi_build_mode='dbg'
-                echo "mpi benchmarks with mpich - built successfully"
+                python3.7 contrib/intel/jenkins/build.py 'ompi_benchmarks' --ofi_build_mode='dbg'
+                echo 'mpi benchmarks with ompi - built successfully'
                 """
               }
             }
         }
-   stage('parallel-tests') {
+
+        stage('parallel-tests') {
             parallel { 
                  stage('eth-tcp-dbg') {
                     agent {node {label 'eth'}}
diff --git a/deps/libfabric/contrib/intel/jenkins/build.py b/deps/libfabric/contrib/intel/jenkins/build.py
index ed26374cbdc2187288d16b2b7ab5201900c3b712..b46e17cb205d949d2dbe187a532194b801792bd9 100755
--- a/deps/libfabric/contrib/intel/jenkins/build.py
+++ b/deps/libfabric/contrib/intel/jenkins/build.py
@@ -10,6 +10,7 @@ import subprocess
 import shlex
 import common
 import re
+import shutil
 
 def build_libfabric(libfab_install_path, mode):
 
@@ -87,12 +88,13 @@ def build_shmem(shmem_dir, libfab_install_path):
 def build_ISx(shmem_dir):
     
     oshcc = '{}/bin/oshcc'.format(shmem_dir)
+    # Copy the ISx sources from a local mirror under ci_site_config.shmem_root
+    # instead of cloning https://github.com/ParRes/ISx.git on every run.
+    isx_src = '{}/ISx'.format(ci_site_config.shmem_root)
+    shutil.copytree(isx_src, '{}/ISx'.format(shmem_dir))
     
-    os.chdir(shmem_dir)
-    git_cmd = ['git', 'clone', '--depth', '1', 'https://github.com/ParRes/ISx.git', 'ISx']
-    
-    common.run_command(git_cmd) 
-    os.chdir('ISx/SHMEM')
+    os.chdir('{}/ISx/SHMEM'.format(shmem_dir))
     common.run_command(['make', 'CC={}'.format(oshcc), 'LDLIBS=-lm']) 
                   
     
@@ -100,10 +102,12 @@ def build_PRK(shmem_dir):
     
     oshcc = '{}/bin/oshcc'.format(shmem_dir)
     shmem_src = '{}/SOS'.format(shmem_dir)
-    os.chdir(shmem_dir)
-    git_cmd = ['git', 'clone', '--depth', ' 1', 'https://github.com/ParRes/Kernels.git', 'PRK']
-    common.run_command(git_cmd)
-    os.chdir('PRK')
+    # Copy the PRK sources from a local mirror under ci_site_config.shmem_root
+    # instead of cloning https://github.com/ParRes/Kernels.git on every run.
+    prk_src = '{}/PRK'.format(ci_site_config.shmem_root)
+    shutil.copytree(prk_src, '{}/PRK'.format(shmem_dir))
+    os.chdir('{}/PRK'.format(shmem_dir))
     with open('common/make.defs','w') as f:
         f.write('SHMEMCC={} -std=c99\nSHMEMTOP={}\n'.format(oshcc,shmem_src))
 
@@ -112,12 +116,12 @@ def build_PRK(shmem_dir):
 def build_uh(shmem_dir):
     oshcc_bin = "{}/bin".format(shmem_dir)
     os.environ["PATH"] += os.pathsep + oshcc_bin
-   
-   
-    os.chdir(shmem_dir) 
-    git_cmd = ['git', 'clone', '--depth', '1', 'https://github.com/openshmem-org/tests-uh.git', 'tests-uh'] 
-    common.run_command(git_cmd)
-    os.chdir('tests-uh')
+    # Copy the tests-uh sources from a local mirror under
+    # ci_site_config.shmem_root instead of cloning
+    # https://github.com/openshmem-org/tests-uh.git on every run.
+    uh_src = '{}/tests-uh'.format(ci_site_config.shmem_root)
+    shutil.copytree(uh_src, '{}/tests-uh'.format(shmem_dir))
+    os.chdir('{}/tests-uh'.format(shmem_dir))
     common.run_command(['make', '-j4', 'C_feature_tests'])
     
 
@@ -138,13 +142,30 @@ def build_mpi(mpi, mpisrc, mpi_install_path, libfab_install_path,  ofi_build_mod
         cmd.append("--enable-fortran=no")
         cmd.append("--with-device=ch4:ofi")
         cmd.append("--enable-ch4-direct=netmod")
-
         
     configure_cmd = shlex.split(" ".join(cmd))
     common.run_command(configure_cmd)
     common.run_command(["make", "clean"])
     common.run_command(["make", "install", "-j32"])
 
+def build_mpich_suite(mpi, mpi_install_path, libfab_install_path):
+
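+    # Build the MPICH test suite from ci_site_config.mpich_src and stage the
+    # built tests under <mpi_install_path>/mpichsuite/test so run.py can
+    # execute them later.  Only impi is handled for now; the suite is
+    # configured against the Intel MPI binaries in ci_site_config.impi_root.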
+    mpich_suite_path = '{}/test/'.format(ci_site_config.mpich_src)
+    mpichsuite_installpath = "{}/mpichsuite/test".format(mpi_install_path)
+    pwd = os.getcwd()
+    if (mpi == 'impi'):
+        os.chdir("{}/mpi".format(mpich_suite_path))
+        cmd = ["./configure", "--with-mpi={}/intel64" \
+               .format(ci_site_config.impi_root)]
+
+        configure_cmd = shlex.split(" ".join(cmd))
+        common.run_command(configure_cmd)
+        common.run_command(["make", "all","-j32"])
+        shutil.copytree(mpich_suite_path, mpichsuite_installpath)
+        common.run_command(["make", "distclean"])
+        os.chdir(pwd)
+
 
 def build_stress_bm(mpi, mpi_install_path, libfab_install_path):
     
@@ -247,7 +268,9 @@ if __name__ == "__main__":
                      else ci_site_config.ompi_src
             # only need to build ompi or mpich, impi is available as binary
             build_mpi(mpi, mpisrc, mpi_install_path, install_path, ofi_build_mode)
-                           
+
+        # build the MPICH test suite
+        build_mpich_suite(mpi, mpi_install_path, install_path)
         # run stress and osu benchmarks for all mpitypes
         build_stress_bm(mpi, mpi_install_path, install_path)
         build_osu_bm(mpi, mpi_install_path, install_path)
diff --git a/deps/libfabric/contrib/intel/jenkins/run.py b/deps/libfabric/contrib/intel/jenkins/run.py
index 683da13134907b826fb4edb291e25032896ed4e9..7873f0f1e77a2a0bc2f7072b9d651a7c0e88c1ad 100755
--- a/deps/libfabric/contrib/intel/jenkins/run.py
+++ b/deps/libfabric/contrib/intel/jenkins/run.py
@@ -75,7 +75,19 @@ def intel_mpi_benchmark(core, hosts, mpi, mode, util=None):
         print("skipping {} as execute condition fails"\
                     .format(imb_test.testname))
     print("----------------------------------------------------------------------------------------\n")
-    
+
+# MPICH test suite
+def mpich_test_suite(core, hosts, mpi, mode, util=None):
+    mpich_tests = tests.MpichTestSuite(jobname=jbname, buildno=bno,
+                                       testname="MpichTestSuite",
+                                       core_prov=core, fabric=fab,
+                                       mpitype=mpi, hosts=hosts,
+                                       ofi_build_mode=mode, util_prov=util)
+    if (mpich_tests.execute_condn and mpich_tests.mpi_gen_execute_condn):
+        # Only the "spawn" group is run for now; MPITEST_RETURN_WITH_CODE is
+        # expected to make each test report failure through its exit code.
+        print("Running MPICH test suite: spawn tests for {}-{}-{}-{}"
+              .format(core, util, fab, mpi))
+        os.environ["MPITEST_RETURN_WITH_CODE"] = "1"
+        mpich_tests.execute_cmd("spawn")
+
 #mpi_stress benchmark tests
 def mpistress_benchmark(core, hosts, mpi, mode, util=None):
 
diff --git a/deps/libfabric/contrib/intel/jenkins/runtests.py b/deps/libfabric/contrib/intel/jenkins/runtests.py
index a7a1b1b2f4da0731134b12b8f90d117d800a2c6f..c3cfe285d8f13c27f9a6ffcee4cae42a295ac2d8 100755
--- a/deps/libfabric/contrib/intel/jenkins/runtests.py
+++ b/deps/libfabric/contrib/intel/jenkins/runtests.py
@@ -50,6 +50,7 @@ if(args_core):
         run.fabtests(args_core, hosts, ofi_build_mode)
         run.shmemtest(args_core, hosts, ofi_build_mode)
         for mpi in mpilist:
+            run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode)
             run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode)   
             run.mpistress_benchmark(args_core, hosts, mpi, ofi_build_mode)
             run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode)  
@@ -58,10 +59,13 @@ if(args_core):
         run.fabtests(args_core, hosts, ofi_build_mode, util=args_util)
         run.shmemtest(args_core, hosts, ofi_build_mode, util=args_util)
         for mpi in mpilist:
+            run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode, \
+                                 util=args_util)
+
             run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode, \
-                                        util=args_util,)
+                                    util=args_util)
             run.mpistress_benchmark(args_core, hosts, mpi, ofi_build_mode, \
-                                            util=args_util)
+                                    util=args_util)
             run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode, \
                                              util=args_util)
 else:
diff --git a/deps/libfabric/contrib/intel/jenkins/tests.py b/deps/libfabric/contrib/intel/jenkins/tests.py
index ac9ecc03a6635c6a7d0e8eafc745481956ac0223..c9cb310286572860fba82348a98df49ee02fab51 100755
--- a/deps/libfabric/contrib/intel/jenkins/tests.py
+++ b/deps/libfabric/contrib/intel/jenkins/tests.py
@@ -140,8 +140,7 @@ class Fabtest(Test):
         return opts
    
     @property
-    def execute_condn(self):
-        # fabtests works for shmem prov only for libfabric debug builds.
+    def execute_condn(self):
+        # fabtests works for shmem prov only for libfabric debug builds.
         return True if (self.core_prov != 'shm' or \
                         self.ofi_build_mode == 'dbg') else False
 
@@ -207,7 +206,6 @@ class MpiTests(Test):
                          fabric, hosts, ofi_build_mode, util_prov)
         self.mpi = mpitype
 
-
     @property
     def cmd(self):
         if (self.mpi == "impi" or self.mpi == "mpich"):
@@ -358,7 +356,77 @@ class MpiTestIMB(MpiTests):
             outputcmd = shlex.split(command + self.rma.imb_cmd)
             common.run_command(outputcmd)
 
+class MpichTestSuite(MpiTests):
+
+    def __init__(self, jobname, buildno, testname, core_prov, fabric,
+                 mpitype, hosts, ofi_build_mode, util_prov=None):
+        super().__init__(jobname, buildno, testname, core_prov, fabric,
+                         mpitype, hosts, ofi_build_mode, util_prov)
+        self.mpichsuitepath = "{}/{}/mpichsuite/test/mpi/" \
+                              .format(self.libfab_installpath, self.mpi)
+        self.pwd = os.getcwd()
+
+    def testgroup(self, testgroupname):
         
+        testpath = "{}/{}".format(self.mpichsuitepath, testgroupname)
+        tests = []
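+        # Each non-comment line of the MPICH "testlist" file is assumed to
+        # have the form "<testname> <nprocs> [key=value ...]"; split each
+        # entry on spaces for execute_cmd() below.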
+        with open("{}/testlist".format(testpath)) as file:
+            for line in file:
+                if (line[0] != '#' and line[0] != '\n'):
+                    tests.append(line.rstrip('\n').split(' '))
+
+        return tests
+
+    def options(self, nprocs, timeout=None):
+        opts = ""
+        if (self.mpi == "impi" or self.mpi == "mpich"):
+            if (self.mpi == "impi"):
+                mpiroot = ci_site_config.impi_root
+            else:
+                mpiroot = "{}/mpich".format(self.libfab_installpath)
+            if (self.util_prov):
+                prov = "\"{};{}\"".format(self.core_prov, self.util_prov)
+            else:
+                prov = self.core_prov
+
+            if (timeout is not None):
+                os.environ['MPIEXEC_TIMEOUT'] = timeout
+
+            opts = "-n {np} -hosts {s},{c} -mpi_root={mpiroot} \
+                    -libfabric_path={installpath}/lib -prov {provider} "  \
+                    .format(np=nprocs, s=self.server, c=self.client, \
+                            provider=prov, mpiroot=mpiroot, \
+                            installpath=self.libfab_installpath)
+
+        elif (self.mpi == "ompi"):
+            print(self.mpi)
+
+        return opts
+
+    @property
+    def execute_condn(self):
+        return True if (self.mpi == 'impi' and  self.core_prov != 'psm2' \
+                        and self.core_prov != 'sockets') else False
+
+    def execute_cmd(self, testgroupname):
+        print("Running Tests: " + testgroupname)
+        tests = []
+        time = None
+        os.chdir("{}/{}".format(self.mpichsuitepath,testgroupname))
+        tests = self.testgroup(testgroupname)
+        for test in tests:
+            testname = test[0]
+            nprocs = test[1]
+            args = test[2:]
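+            # remaining fields are optional key=value settings; "timelimit"
+            # is assumed to be the per-test timeout in seconds, forwarded to
+            # mpiexec via MPIEXEC_TIMEOUT in options()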
+            for item in args:
+                itemlist = item.split('=')
+                if (itemlist[0] == 'timelimit'):
+                    time = itemlist[1]
+            opts = self.options(nprocs, timeout=time)
+            testcmd = self.cmd + opts + "./{}".format(testname)
+            outputcmd = shlex.split(testcmd)
+            common.run_command(outputcmd)
+        os.chdir(self.pwd)
+
 class MpiTestStress(MpiTests):
      
     def __init__(self, jobname, buildno, testname, core_prov, fabric, 
diff --git a/deps/libfabric/docs/policy b/deps/libfabric/docs/policy
index 5cf066c143b4511b6e32c4c29f30947ab85a3c4a..25c860ea94521e77f8bede7d4fbf371dad9035f4 100644
--- a/deps/libfabric/docs/policy
+++ b/deps/libfabric/docs/policy
@@ -4,7 +4,13 @@ This document describes the general policies and procedures that
 are followed by the libfabric development community.  It is best
 viewed as a guideline, and is not a formal or legal document.
 
-Code contributions
+
+DEVELOPER GUIDELINES
+====================
+The following guidelines are helpful for developers new to the
+libfabric community and open source development.
+
+Code Contributions
 ------------------
 Any developers wishing to contribute to libfabric may do so,
 provided that they adhere to the CONTRIBUTORS agreement in the root
@@ -13,31 +19,6 @@ or documents that must be signed prior to submitting code.  Developers
 need the rights to submit the code being introduced, and the code
 must meet the license requirements of the project.
 
-Git Repository Admin
---------------------
-The number of people with administrative access to the github repo
-will be limited.  Traditionally, this has been around three developers who
-are active in the project, and are from different companies.  Admins
-will typically have the same limitations as those with write access to
-the repo, such as no forced updates.
-
-Git Write Access
-----------------
-Because of the scope of the project, there may be several people (more
-than 10) with write access.  Most writers are maintainers for a
-specific provider in the project.  As a general rule, writers should only
-commit changes to the subdirectory that corresponds with the provider
-that they are maintaining.  Changes made to other providers or the
-libfabric core must be approved prior by the relevant owners prior to
-being merged.
-
-Core Changes
-------------
-Updates to the libfabric core should be reviewed by at least one other
-developer.  Changes to the API should be brought to the attention of
-the OFIWG mailing list, with significant changes discussed prior to
-being implemented.
-
 Patch Submission
 ----------------
 Patches should be submitted directly to github as part of a pull request.
@@ -45,6 +26,43 @@ For patches that touch the external API or introduce or modify core
 functionality significantly, an email should be sent to the ofiwg mail
 list with a link to the pull request.
 
+Patches should include a clear description of the problem that the patch
+is addressing, and how it does so.  One or two line descriptions are
+almost never sufficient, except for the most trivial code changes.
+The description should stand on its own, and provide enough context
+for someone to determine what the patch does, without needing to read
+the accompanying code changes.  Oftentimes, the purpose of a patch is
+made clearer as part of a review discussion.  When this occurs, the
+portion of the discussion clarifying the purpose of a change should be
+folded into the patch description.
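+
+As an illustration, a description whose subject line names the touched
+component and the fix (for example, "prov/verbs: fix completion ordering"),
+followed by a paragraph explaining what was broken, why, and how the patch
+fixes it, would normally meet this bar.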
+
+Each patch should address a single problem.  When a patch description
+indicates that a patch does A, B, and C, that's usually the indication
+that the patch should have been split into three separate patches.
+An exception may be made if an unrelated change occurs in the code that
+surrounds the patch, provided that the change is trivial.  For example,
+white space cleanup or fixing typos in comments may be allowed to slip
+through the review process, even though those changes are unrelated to
+the patch.
+
+No single patch should ever break the build or result in incorrect operation.
+That is, arbitrarily breaking up a patch into two or more pieces, all of
+which must be applied to bring the repository back to a stable state, is
+not allowed.
+
+One of the most common reasons that a patch is rejected is that it is
+trying to change too many things at once.  The standard argument back is
+that the developer viewed the entire set of changes as one entity.  The
+best chance of having code accepted with minimal changes requested is
+to keep patches small.  If a large set of changes requires restructuring
+the existing code, then separate out the restructuring into its own set
+of patches.  It's okay for a patch to do nothing significant other than
+prepare the code for a follow-on patch.  In fact, it's often preferred,
+as that can help identify alternatives that weren't considered.
+
+For help on how to write a good patch and patch description, search the
+web.  There are plenty of helpful tutorials out there.
+
 Pull Requests
 -------------
 A number of continuous integration tests run against all pull requests.
@@ -57,6 +75,44 @@ may be ignored, and the pull request merged.  It is the responsibility
 of the person committing the request to the repo to confirm that any
 CI failures are unrelated to the changes in the pull request.
 
+Core Changes
+------------
+Updates to the libfabric core should be reviewed by at least one other
+developer.  Changes to the API should be brought to the attention of
+the OFIWG mailing list, with significant changes discussed prior to
+being implemented.
+
+API Changes
+-----------
+All files under the include/rdma subdirectory are maintained as part of
+the stable libfabric API.  Any changes to those files will receive a
+strongly scrutinized review, as changes there have a much broader impact
+across not just the project, but the entire libfabric software ecosystem.
+For additional details, see include/ofi_abi.h before deciding that you
+really don't need that API change. :)
+
+
+PROJECT ADMINISTRATION
+======================
+
+Git Repository Admin
+--------------------
+The number of people with administrative access to the github repo
+will be limited.  Traditionally, this has been around three developers who
+are active in the project, and are from different companies.  Admins
+will typically have the same limitations as those with write access to
+the repo, such as no forced updates.
+
+Git Write Access
+----------------
+Because of the scope of the project, there may be several people (more
+than 10) with write access.  Most writers are maintainers for a
+specific provider in the project.  As a general rule, writers should only
+commit changes to the subdirectory that corresponds with the provider
+that they are maintaining.  Changes made to other providers or the
+libfabric core must be approved by the relevant owners prior to
+being merged.
+
 Releases
 --------
 A wiki page maintained on github with the repo provides a full checklist
diff --git a/deps/libfabric/fabtests/Makefile.am b/deps/libfabric/fabtests/Makefile.am
index 359319792ac0e42e573ad77b2ce5aad841810030..f60c81e716988c3e2348337ff2ebb85b1509a0a1 100644
--- a/deps/libfabric/fabtests/Makefile.am
+++ b/deps/libfabric/fabtests/Makefile.am
@@ -4,6 +4,7 @@ ACLOCAL_AMFLAGS = -I config
 
 if MACOS
 os_excludes = -f ./test_configs/osx.exclude
+AM_CFLAGS += -I$(srcdir)/include/osx
 endif
 
 if FREEBSD
@@ -52,6 +53,7 @@ bin_PROGRAMS = \
 	unit/fi_eq_test \
 	unit/fi_cq_test \
 	unit/fi_mr_test \
+	unit/fi_mr_cache_evict \
 	unit/fi_cntr_test \
 	unit/fi_av_test \
 	unit/fi_dom_test \
@@ -108,7 +110,12 @@ noinst_LTLIBRARIES = libfabtests.la
 libfabtests_la_SOURCES = \
 	common/shared.c \
 	common/jsmn.c \
+	common/hmem.c \
+	common/hmem_cuda.c \
+	common/hmem_rocr.c \
+	common/hmem_ze.c \
 	include/shared.h \
+	include/hmem.h \
 	include/jsmn.h \
 	include/unix/osd.h \
 	include/ft_osd.h
@@ -303,6 +310,11 @@ unit_fi_mr_test_SOURCES = \
 	$(unit_srcs)
 unit_fi_mr_test_LDADD = libfabtests.la
 
+unit_fi_mr_cache_evict_SOURCES = \
+	unit/mr_cache_evict.c \
+	$(unit_srcs)
+unit_fi_mr_cache_evict_LDADD = libfabtests.la
+
 unit_fi_cntr_test_SOURCES = \
 	unit/cntr_test.c \
 	$(unit_srcs)
diff --git a/deps/libfabric/fabtests/Makefile.win b/deps/libfabric/fabtests/Makefile.win
index 9be5046ffffed1aec1042b5ec71a56c26e59e192..b1ed3b978f5938b025dc79ed17f16f0b2ce798f0 100644
--- a/deps/libfabric/fabtests/Makefile.win
+++ b/deps/libfabric/fabtests/Makefile.win
@@ -27,8 +27,9 @@ outdir = $(output_root)$(arch)\release-v141
 CFLAGS = $(CFLAGS) /O2 /MT
 !endif
 
-basedeps = common\shared.c common\jsmn.c common\windows\getopt.c \
-	common\windows\osd.c
+basedeps = common\hmem.c common\shared.c common\jsmn.c \
+	common\windows\getopt.c common\windows\osd.c \
+	common\hmem_cuda.c common\hmem_rocr.c common\hmem_ze.c
 
 includes = /Iinclude /Iinclude\windows /I..\include /FIft_osd.h \
 	/Iinclude\windows\getopt
diff --git a/deps/libfabric/fabtests/benchmarks/msg_bw.c b/deps/libfabric/fabtests/benchmarks/msg_bw.c
index f273d6a1f54b849e0c7ccf7663d0f607c43560de..2e36d0ff1d3e4fc83dcb9c8fcf5794a1a40177f8 100644
--- a/deps/libfabric/fabtests/benchmarks/msg_bw.c
+++ b/deps/libfabric/fabtests/benchmarks/msg_bw.c
@@ -107,6 +107,7 @@ int main(int argc, char **argv)
 	hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/benchmarks/msg_pingpong.c b/deps/libfabric/fabtests/benchmarks/msg_pingpong.c
index ef342eae8988046690d6526bc15d72983ee8eef6..a03864724290947c3e590b08576f44a026b6cf21 100644
--- a/deps/libfabric/fabtests/benchmarks/msg_pingpong.c
+++ b/deps/libfabric/fabtests/benchmarks/msg_pingpong.c
@@ -107,6 +107,7 @@ int main(int argc, char **argv)
 	hints->caps = FI_MSG;
 	hints->domain_attr->mr_mode = opts.mr_mode;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->addr_format = opts.address_format;
 
 	ret = run();
 
diff --git a/deps/libfabric/fabtests/benchmarks/rma_bw.c b/deps/libfabric/fabtests/benchmarks/rma_bw.c
index e4351c89bb2c764073d2593de6c8a9e80439349f..a8ace33bcc1c81af2f085e1c7e3d4b323fecd218 100644
--- a/deps/libfabric/fabtests/benchmarks/rma_bw.c
+++ b/deps/libfabric/fabtests/benchmarks/rma_bw.c
@@ -95,6 +95,7 @@ int main(int argc, char **argv)
 	hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
 	hints->mode = FI_CONTEXT;
 	hints->domain_attr->threading = FI_THREAD_DOMAIN;
+	hints->addr_format = opts.address_format;
 
 	while ((op = getopt(argc, argv, "ho:" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) {
 		switch (op) {
diff --git a/deps/libfabric/fabtests/common/hmem.c b/deps/libfabric/fabtests/common/hmem.c
new file mode 100644
index 0000000000000000000000000000000000000000..8736817e5bb39674d4bf39a7b3999a0fe5fc67de
--- /dev/null
+++ b/deps/libfabric/fabtests/common/hmem.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2020 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under the BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+	#include <config.h>
+#endif
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include "hmem.h"
+
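+/* Track whether an interface was initialized so ft_hmem_cleanup() is a
+ * safe no-op when ft_hmem_init() failed or was never called. */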
+static bool hmem_initialized = false;
+
+struct ft_hmem_ops {
+	int (*init)(void);
+	int (*cleanup)(void);
+	int (*alloc)(uint64_t device, void **buf, size_t size);
+	int (*free)(void *buf);
+	int (*memset)(uint64_t device, void *buf, int value, size_t size);
+	int (*copy_to_hmem)(uint64_t device, void *dst, const void *src,
+			    size_t size);
+	int (*copy_from_hmem)(uint64_t device, void *dst, const void *src,
+			      size_t size);
+};
+
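+/*
+ * Dispatch table indexed by enum fi_hmem_iface.  Every memory interface
+ * (host, CUDA, ROCR, ZE) supplies the same set of operations, keeping
+ * the ft_hmem_* callers below interface-agnostic.
+ */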
+static struct ft_hmem_ops hmem_ops[] = {
+	[FI_HMEM_SYSTEM] = {
+		.init = ft_host_init,
+		.cleanup = ft_host_cleanup,
+		.alloc = ft_host_alloc,
+		.free = ft_host_free,
+		.memset = ft_host_memset,
+		.copy_to_hmem = ft_host_memcpy,
+		.copy_from_hmem = ft_host_memcpy,
+	},
+	[FI_HMEM_CUDA] = {
+		.init = ft_cuda_init,
+		.cleanup = ft_cuda_cleanup,
+		.alloc = ft_cuda_alloc,
+		.free = ft_cuda_free,
+		.memset = ft_cuda_memset,
+		.copy_to_hmem = ft_cuda_copy_to_hmem,
+		.copy_from_hmem = ft_cuda_copy_from_hmem,
+	},
+	[FI_HMEM_ROCR] = {
+		.init = ft_rocr_init,
+		.cleanup = ft_rocr_cleanup,
+		.alloc = ft_rocr_alloc,
+		.free = ft_rocr_free,
+		.memset = ft_rocr_memset,
+		.copy_to_hmem = ft_rocr_memcpy,
+		.copy_from_hmem = ft_rocr_memcpy,
+	},
+	[FI_HMEM_ZE] = {
+		.init = ft_ze_init,
+		.cleanup = ft_ze_cleanup,
+		.alloc = ft_ze_alloc,
+		.free = ft_ze_free,
+		.memset = ft_ze_memset,
+		.copy_to_hmem = ft_ze_copy,
+		.copy_from_hmem = ft_ze_copy,
+	},
+};
+
+int ft_hmem_init(enum fi_hmem_iface iface)
+{
+	int ret;
+
+	ret = hmem_ops[iface].init();
+	if (ret == FI_SUCCESS)
+		hmem_initialized = true;
+
+	return ret;
+}
+
+int ft_hmem_cleanup(enum fi_hmem_iface iface)
+{
+	int ret = FI_SUCCESS;
+
+	if (hmem_initialized) {
+		ret = hmem_ops[iface].cleanup();
+		if (ret == FI_SUCCESS)
+			hmem_initialized = false;
+	}
+
+	return ret;
+}
+
+int ft_hmem_alloc(enum fi_hmem_iface iface, uint64_t device, void **buf,
+		  size_t size)
+{
+	return hmem_ops[iface].alloc(device, buf, size);
+}
+
+int ft_hmem_free(enum fi_hmem_iface iface, void *buf)
+{
+	return hmem_ops[iface].free(buf);
+}
+
+int ft_hmem_memset(enum fi_hmem_iface iface, uint64_t device, void *buf,
+		   int value, size_t size)
+{
+	return hmem_ops[iface].memset(device, buf, value, size);
+}
+
+int ft_hmem_copy_to(enum fi_hmem_iface iface, uint64_t device, void *dst,
+		    const void *src, size_t size)
+{
+	return hmem_ops[iface].copy_to_hmem(device, dst, src, size);
+}
+
+int ft_hmem_copy_from(enum fi_hmem_iface iface, uint64_t device, void *dst,
+		      const void *src, size_t size)
+{
+	return hmem_ops[iface].copy_from_hmem(device, dst, src, size);
+}
diff --git a/deps/libfabric/fabtests/common/hmem_cuda.c b/deps/libfabric/fabtests/common/hmem_cuda.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea3cfb7895400e8b25457f3d5d8c9b33d03ff795
--- /dev/null
+++ b/deps/libfabric/fabtests/common/hmem_cuda.c
@@ -0,0 +1,224 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "hmem.h"
+#include "shared.h"
+
+#ifdef HAVE_CUDA_RUNTIME_H
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+struct cuda_ops {
+	cudaError_t (*cudaMemcpy)(void *dst, const void *src, size_t count,
+				  enum cudaMemcpyKind kind);
+	cudaError_t (*cudaMalloc)(void **ptr, size_t size);
+	cudaError_t (*cudaFree)(void *ptr);
+	cudaError_t (*cudaMemset)(void *ptr, int value, size_t count);
+	const char *(*cudaGetErrorName)(cudaError_t error);
+	const char *(*cudaGetErrorString)(cudaError_t error);
+};
+
+static struct cuda_ops cuda_ops;
+static void *cudart_handle;
+
+#define CUDA_ERR(err, fmt, ...) \
+	FT_ERR(fmt ": %s %s", ##__VA_ARGS__, cuda_ops.cudaGetErrorName(err), \
+	       cuda_ops.cudaGetErrorString(err))
+
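+/*
+ * Resolve the CUDA runtime entry points at run time with dlopen()/dlsym()
+ * so the fabtests binaries do not need to link against libcudart.
+ */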
+int ft_cuda_init(void)
+{
+	cudart_handle = dlopen("libcudart.so", RTLD_NOW);
+	if (!cudart_handle) {
+		FT_ERR("Failed to dlopen libcudart.so");
+		goto err;
+	}
+
+	cuda_ops.cudaMemcpy = dlsym(cudart_handle, "cudaMemcpy");
+	if (!cuda_ops.cudaMemcpy) {
+		FT_ERR("Failed to find cudaMemcpy");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaMalloc = dlsym(cudart_handle, "cudaMalloc");
+	if (!cuda_ops.cudaMalloc) {
+		FT_ERR("Failed to find cudaMalloc");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaFree = dlsym(cudart_handle, "cudaFree");
+	if (!cuda_ops.cudaFree) {
+		FT_ERR("Failed to find cudaFree");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaMemset = dlsym(cudart_handle, "cudaMemset");
+	if (!cuda_ops.cudaMemset) {
+		FT_ERR("Failed to find cudaMemset");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaGetErrorName = dlsym(cudart_handle, "cudaGetErrorName");
+	if (!cuda_ops.cudaGetErrorName) {
+		FT_ERR("Failed to find cudaGetErrorName");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaGetErrorString = dlsym(cudart_handle,
+					    "cudaGetErrorString");
+	if (!cuda_ops.cudaGetErrorString) {
+		FT_ERR("Failed to find cudaGetErrorString");
+		goto err_dlclose_cuda;
+	}
+
+	return FI_SUCCESS;
+
+err_dlclose_cuda:
+	dlclose(cudart_handle);
+err:
+	return -FI_ENODATA;
+}
+
+int ft_cuda_cleanup(void)
+{
+	dlclose(cudart_handle);
+	return FI_SUCCESS;
+}
+
+int ft_cuda_alloc(uint64_t device, void **buf, size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaMalloc(buf, size);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	CUDA_ERR(cuda_ret, "cudaMalloc failed");
+
+	return -FI_ENOMEM;
+}
+
+int ft_cuda_free(void *buf)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaFree(buf);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	CUDA_ERR(cuda_ret, "cudaFree failed");
+
+	return -FI_EIO;
+}
+
+int ft_cuda_memset(uint64_t device, void *buf, int value, size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaMemset(buf, value, size);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	CUDA_ERR(cuda_ret, "cudaMemset failed");
+
+	return -FI_EIO;
+}
+
+int ft_cuda_copy_to_hmem(uint64_t device, void *dst, const void *src,
+			 size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	CUDA_ERR(cuda_ret, "cudaMemcpy failed");
+
+	return -FI_EIO;
+}
+
+int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src,
+			   size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = cuda_ops.cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
+	if (cuda_ret == cudaSuccess)
+		return FI_SUCCESS;
+
+	CUDA_ERR(cuda_ret, "cudaMemcpy failed");
+
+	return -FI_EIO;
+}
+
+#else
+
+int ft_cuda_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_cuda_cleanup(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_cuda_alloc(uint64_t device, void **buf, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_cuda_free(void *buf)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_cuda_memset(uint64_t device, void *buf, int value, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_cuda_copy_to_hmem(uint64_t device, void *dst, const void *src,
+			 size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src,
+			   size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+#endif /* HAVE_CUDA_RUNTIME_H */
diff --git a/deps/libfabric/fabtests/common/hmem_rocr.c b/deps/libfabric/fabtests/common/hmem_rocr.c
new file mode 100644
index 0000000000000000000000000000000000000000..58b11e6bbf8ceabb05a8ebb162ff3846691b60d2
--- /dev/null
+++ b/deps/libfabric/fabtests/common/hmem_rocr.c
@@ -0,0 +1,324 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "hmem.h"
+#include "shared.h"
+
+#ifdef HAVE_ROCR_RUNTIME_H
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <hsa/hsa.h>
+
+struct rocr_ops {
+	hsa_status_t (*hsa_memory_copy)(void *dst, const void *src,
+					size_t size);
+	hsa_status_t (*hsa_init)(void);
+	hsa_status_t (*hsa_shut_down)(void);
+	hsa_status_t (*hsa_status_string)(hsa_status_t status,
+					  const char **status_string);
+	hsa_status_t (*hsa_agent_get_info)(hsa_agent_t agent,
+					   hsa_agent_info_t attribute,
+					   void *value);
+	hsa_status_t (*hsa_region_get_info)(hsa_region_t region,
+					    hsa_region_info_t attribute,
+					    void *value);
+	hsa_status_t (*hsa_iterate_agents)
+		(hsa_status_t (*cb)(hsa_agent_t agent, void* data), void *data);
+	hsa_status_t (*hsa_agent_iterate_regions)
+		(hsa_agent_t agent,
+		 hsa_status_t (*cb)(hsa_region_t region, void* data),
+		 void *data);
+	hsa_status_t (*hsa_memory_allocate)(hsa_region_t region, size_t size,
+					    void **ptr);
+	hsa_status_t (*hsa_memory_free)(void *ptr);
+};
+
+static struct rocr_ops rocr_ops;
+static void *rocr_handle;
+
+static const char *hsa_status_to_string(hsa_status_t status)
+{
+	const char *str;
+	hsa_status_t hsa_ret;
+
+	hsa_ret = rocr_ops.hsa_status_string(status, &str);
+	if (hsa_ret != HSA_STATUS_SUCCESS)
+		return "unknown error";
+
+	return str;
+}
+
+#define ROCR_ERR(err, fmt, ...) \
+	FT_ERR(fmt ": %s", ##__VA_ARGS__, hsa_status_to_string(err))
+
+static hsa_agent_t gpu_agent;
+static hsa_region_t gpu_region;
+
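+/*
+ * The agent and region callbacks return HSA_STATUS_INFO_BREAK on the
+ * first match, which stops the HSA iteration early; ft_rocr_init()
+ * treats that status as success.
+ */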
+static hsa_status_t agent_cb(hsa_agent_t agent, void *data)
+{
+	hsa_status_t hsa_ret;
+	hsa_device_type_t hsa_dev_type;
+
+	hsa_ret = rocr_ops.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE,
+					      (void *) &hsa_dev_type);
+
+	if (hsa_ret == HSA_STATUS_SUCCESS &&
+	    hsa_dev_type == HSA_DEVICE_TYPE_GPU) {
+		gpu_agent = agent;
+		return HSA_STATUS_INFO_BREAK;
+	}
+
+	return hsa_ret;
+}
+
+static hsa_status_t region_cb(hsa_region_t region, void *data)
+{
+	hsa_status_t hsa_ret;
+	hsa_region_segment_t hsa_segment;
+
+	hsa_ret = rocr_ops.hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT,
+					       &hsa_segment);
+
+	if (hsa_ret == HSA_STATUS_SUCCESS &&
+	    hsa_segment == HSA_REGION_SEGMENT_GLOBAL) {
+		gpu_region = region;
+		return HSA_STATUS_INFO_BREAK;
+	}
+
+	return hsa_ret;
+}
+
+int ft_rocr_init(void)
+{
+	hsa_status_t hsa_ret;
+
+	rocr_handle = dlopen("libhsa-runtime64.so", RTLD_NOW);
+	if (!rocr_handle) {
+		FT_ERR("Failed to dlopen libhsa-runtime64.so");
+		goto err;
+	}
+
+	rocr_ops.hsa_memory_copy = dlsym(rocr_handle, "hsa_memory_copy");
+	if (!rocr_ops.hsa_memory_copy) {
+		FT_ERR("Failed to find hsa_memory_copy");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_init = dlsym(rocr_handle, "hsa_init");
+	if (!rocr_ops.hsa_init) {
+		FT_ERR("Failed to find hsa_init");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_shut_down = dlsym(rocr_handle, "hsa_shut_down");
+	if (!rocr_ops.hsa_shut_down) {
+		FT_ERR("Failed to find hsa_shut_down");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_status_string = dlsym(rocr_handle, "hsa_status_string");
+	if (!rocr_ops.hsa_status_string) {
+		FT_ERR("Failed to find hsa_status_string");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_agent_get_info = dlsym(rocr_handle, "hsa_agent_get_info");
+	if (!rocr_ops.hsa_agent_get_info) {
+		FT_ERR("Failed to find hsa_agent_get_info");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_region_get_info = dlsym(rocr_handle,
+					     "hsa_region_get_info");
+	if (!rocr_ops.hsa_region_get_info) {
+		FT_ERR("Failed to find hsa_region_get_info");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_iterate_agents = dlsym(rocr_handle, "hsa_iterate_agents");
+	if (!rocr_ops.hsa_iterate_agents) {
+		FT_ERR("Failed to find hsa_iterate_agents");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_agent_iterate_regions =
+		dlsym(rocr_handle, "hsa_agent_iterate_regions");
+	if (!rocr_ops.hsa_agent_iterate_regions) {
+		FT_ERR("Failed to find hsa_agent_iterate_regions");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_memory_allocate =
+		dlsym(rocr_handle, "hsa_memory_allocate");
+	if (!rocr_ops.hsa_memory_allocate) {
+		FT_ERR("Failed to find hsa_memory_allocate");
+		goto err_dlclose_rocr;
+	}
+
+	rocr_ops.hsa_memory_free = dlsym(rocr_handle, "hsa_memory_free");
+	if (!rocr_ops.hsa_memory_free) {
+		FT_ERR("Failed to find hsa_memory_free");
+		goto err_dlclose_rocr;
+	}
+
+	hsa_ret = rocr_ops.hsa_init();
+	if (hsa_ret != HSA_STATUS_SUCCESS) {
+		ROCR_ERR(hsa_ret, "hsa_init failed");
+		goto err_dlclose_rocr;
+	}
+
+	hsa_ret = rocr_ops.hsa_iterate_agents(agent_cb, NULL);
+	if (hsa_ret != HSA_STATUS_INFO_BREAK) {
+		FT_ERR("Failed to find GPU agent");
+		goto err_dlclose_rocr;
+	}
+
+	hsa_ret = rocr_ops.hsa_agent_iterate_regions(gpu_agent, region_cb,
+						     NULL);
+	if (hsa_ret != HSA_STATUS_INFO_BREAK) {
+		FT_ERR("Failed to find GPU region");
+		goto err_dlclose_rocr;
+	}
+
+	return FI_SUCCESS;
+
+err_dlclose_rocr:
+	dlclose(rocr_handle);
+err:
+	return -FI_ENODATA;
+}
+
+int ft_rocr_cleanup(void)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = rocr_ops.hsa_shut_down();
+	if (hsa_ret != HSA_STATUS_SUCCESS) {
+		ROCR_ERR(hsa_ret, "hsa_shut_down failed");
+		return -FI_ENODATA;
+	}
+
+	dlclose(rocr_handle);
+
+	return FI_SUCCESS;
+}
+
+int ft_rocr_alloc(uint64_t device, void **buf, size_t size)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = rocr_ops.hsa_memory_allocate(gpu_region, size, buf);
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return FI_SUCCESS;
+
+	ROCR_ERR(hsa_ret, "hsa_memory_allocate failed");
+
+	return -FI_ENOMEM;
+}
+
+int ft_rocr_free(void *buf)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = rocr_ops.hsa_memory_free(buf);
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return FI_SUCCESS;
+
+	ROCR_ERR(hsa_ret, "hsa_memory_free failed");
+
+	return -FI_EIO;
+}
+
+int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size)
+{
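+	/* None of the HSA calls wired up in rocr_ops provides a memset,
+	 * so fill the buffer one byte at a time via hsa_memory_copy(). */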
+	unsigned char *ptr = buf;
+	unsigned char set_value = value;
+	int ret;
+
+	while (size-- > 0) {
+		ret = ft_rocr_memcpy(device, ptr, &set_value, sizeof(*ptr));
+		if (ret != FI_SUCCESS)
+			return ret;
+
+		ptr++;
+	}
+
+	return FI_SUCCESS;
+}
+
+int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = rocr_ops.hsa_memory_copy(dst, src, size);
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return FI_SUCCESS;
+
+	ROCR_ERR(hsa_ret, "hsa_memory_copy failed");
+
+	return -FI_EIO;
+}
+
+#else
+
+int ft_rocr_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_rocr_cleanup(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_rocr_alloc(uint64_t device, void **buf, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_rocr_free(void *buf)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+#endif /* HAVE_ROCR_RUNTIME_H */
diff --git a/deps/libfabric/fabtests/common/hmem_ze.c b/deps/libfabric/fabtests/common/hmem_ze.c
new file mode 100644
index 0000000000000000000000000000000000000000..8e9b73f048589e2a4a7b62d1cb57c0d96229baaf
--- /dev/null
+++ b/deps/libfabric/fabtests/common/hmem_ze.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "hmem.h"
+
+#ifdef HAVE_LIBZE
+
+#include <level_zero/ze_api.h>
+
+#define ZE_MAX_DEVICES 4
+
+static ze_context_handle_t context;
+static ze_device_handle_t devices[ZE_MAX_DEVICES];
+static ze_command_queue_handle_t cmd_queue[ZE_MAX_DEVICES];
+static int num_devices = 0;
+
+static const ze_command_queue_desc_t cq_desc = {
+	.stype		= ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+	.pNext		= NULL,
+	.ordinal	= 0,
+	.index		= 0,
+	.flags		= 0,
+	.mode		= ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+	.priority	= ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
+};
+
+static const ze_command_list_desc_t cl_desc = {
+	.stype				= ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+	.pNext				= NULL,
+	.commandQueueGroupOrdinal	= 0,
+	.flags				= 0,
+};
+
+static const ze_device_mem_alloc_desc_t device_desc = {
+	.stype		= ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+	.pNext		= NULL,
+	.flags		= 0,
+	.ordinal	= 0,
+};
+
+static const ze_host_mem_alloc_desc_t host_desc = {
+	.stype		= ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+	.pNext		= NULL,
+	.flags		= 0,
+};
+
+int ft_ze_init(void)
+{
+	ze_driver_handle_t driver;
+	ze_context_desc_t context_desc = {0};
+	ze_result_t ze_ret;
+	uint32_t count;
+
+	ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+	if (ze_ret)
+		return -FI_EIO;
+
+	count = 1;
+	ze_ret = zeDriverGet(&count, &driver);
+	if (ze_ret)
+		return -FI_EIO;
+
+	ze_ret = zeContextCreate(driver, &context_desc, &context);
+	if (ze_ret)
+		return -FI_EIO;
+
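+	/* The first zeDeviceGet() call queries the device count; the
+	 * second fills the devices array. */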
+	count = 0;
+	ze_ret = zeDeviceGet(driver, &count, NULL);
+	if (ze_ret || count > ZE_MAX_DEVICES)
+		goto err;
+
+	ze_ret = zeDeviceGet(driver, &count, devices);
+	if (ze_ret)
+		goto err;
+
+	for (num_devices = 0; num_devices < count; num_devices++) {
+		ze_ret = zeCommandQueueCreate(context, devices[num_devices], &cq_desc,
+					      &cmd_queue[num_devices]);
+		if (ze_ret)
+			goto err;
+	}
+
+	return FI_SUCCESS;
+
+err:
+	(void) ft_ze_cleanup();
+	return -FI_EIO;
+}
+
+int ft_ze_cleanup(void)
+{
+	int i, ret = FI_SUCCESS;
+
+	for (i = 0; i < num_devices; i++) {
+		if (cmd_queue[i] && zeCommandQueueDestroy(cmd_queue[i]))
+			ret = -FI_EINVAL;
+	}
+
+	if (zeContextDestroy(context))
+		return -FI_EINVAL;
+
+	return ret;
+}
+
+int ft_ze_alloc(uint64_t device, void **buf, size_t size)
+{
+	return zeMemAllocShared(context, &device_desc, &host_desc,
+				size, 16, devices[device], buf) ?
+				-FI_EINVAL : 0;
+}
+
+int ft_ze_free(void *buf)
+{
+	return zeMemFree(context, buf) ? -FI_EINVAL : FI_SUCCESS;
+}
+
+int ft_ze_memset(uint64_t device, void *buf, int value, size_t size)
+{
+	ze_command_list_handle_t cmd_list;
+	ze_result_t ze_ret;
+
+	ze_ret = zeCommandListCreate(context, devices[device], &cl_desc, &cmd_list);
+	if (ze_ret)
+		return -FI_EIO;
+
+	ze_ret = zeCommandListAppendMemoryFill(cmd_list, buf, &value,
+					       sizeof(value), size, NULL, 0, NULL);
+	if (ze_ret)
+		goto free;
+
+	ze_ret = zeCommandListClose(cmd_list);
+	if (ze_ret)
+		goto free;
+
+	ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[device], 1,
+						   &cmd_list, NULL);
+
+free:
+	if (!zeCommandListDestroy(cmd_list) && !ze_ret)
+		return FI_SUCCESS;
+
+	return -FI_EINVAL;
+}
+
+int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size)
+{
+	ze_command_list_handle_t cmd_list;
+	ze_result_t ze_ret;
+
+	ze_ret = zeCommandListCreate(context, devices[device], &cl_desc, &cmd_list);
+	if (ze_ret)
+		return -FI_EIO;
+
+	ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL, 0, NULL);
+	if (ze_ret)
+		goto free;
+
+	ze_ret = zeCommandListClose(cmd_list);
+	if (ze_ret)
+		goto free;
+
+	ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[device], 1,
+						   &cmd_list, NULL);
+
+free:
+	if (!zeCommandListDestroy(cmd_list) && !ze_ret)
+		return FI_SUCCESS;
+
+	return -FI_EINVAL;
+}
+
+#else
+
+int ft_ze_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_ze_cleanup(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_ze_alloc(uint64_t device, void **buf, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_ze_free(void *buf)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_ze_memset(uint64_t device, void *buf, int value, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+
+#endif /* HAVE_LIBZE */
diff --git a/deps/libfabric/fabtests/common/shared.c b/deps/libfabric/fabtests/common/shared.c
index 010a113ae98bfd43b1a2dd68e95cfdb3ab22e04b..95737b64094d8c276e25e4f670ae49f7f557d72d 100644
--- a/deps/libfabric/fabtests/common/shared.c
+++ b/deps/libfabric/fabtests/common/shared.c
@@ -49,6 +49,7 @@
 #include <rdma/fi_collective.h>
 
 #include <shared.h>
+#include <hmem.h>
 
 struct fi_info *fi_pep, *fi, *hints;
 struct fid_fabric *fabric;
@@ -358,6 +359,48 @@ void ft_free_bit_combo(uint64_t *combo)
 	free(combo);
 }
 
+static int ft_reg_mr(void *buf, size_t size, uint64_t access,
+		     uint64_t key, struct fid_mr **mr, void **desc)
+{
+	struct fi_mr_attr attr = {0};
+	struct iovec iov = {0};
+	int ret;
+
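+	/* Skip registration when the provider does not require it: host
+	 * buffers need an MR only under FI_MR_LOCAL, device buffers only
+	 * under FI_MR_HMEM, and RMA/atomic capable endpoints always
+	 * register. */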
+	if (((!(fi->domain_attr->mr_mode & FI_MR_LOCAL) &&
+	      !(opts.options & FT_OPT_USE_DEVICE)) ||
+	     (!(fi->domain_attr->mr_mode & FI_MR_HMEM) &&
+	      opts.options & FT_OPT_USE_DEVICE)) &&
+	    !(fi->caps & (FI_RMA | FI_ATOMIC)))
+		return 0;
+
+	iov.iov_base = buf;
+	iov.iov_len = size;
+	attr.mr_iov = &iov;
+	attr.iov_count = 1;
+	attr.access = access;
+	attr.offset = 0;
+	attr.requested_key = key;
+	attr.context = NULL;
+	attr.iface = opts.iface;
+
+	switch (opts.iface) {
+	case FI_HMEM_ZE:
+		attr.device.ze = opts.device;
+		break;
+	default:
+		break;
+	}
+
+	ret = fi_mr_regattr(domain, &attr, 0, mr);
+	if (ret)
+		return ret;
+
+	if (desc)
+		*desc = fi_mr_desc(*mr);
+
+	return FI_SUCCESS;
+}
+
 static int ft_alloc_ctx_array(struct ft_context **mr_array, char ***mr_bufs,
 			      char *default_buf, size_t mr_size,
 			      uint64_t start_key)
@@ -380,24 +423,22 @@ static int ft_alloc_ctx_array(struct ft_context **mr_array, char ***mr_bufs,
 		context = &(*mr_array)[i];
 		if (!(opts.options & FT_OPT_ALLOC_MULT_MR)) {
 			context->buf = default_buf + mr_size * i;
+			context->mr = mr;
+			context->desc = mr_desc;
 			continue;
 		}
-		(*mr_bufs)[i] = calloc(1, mr_size);
+		ret = ft_hmem_alloc(opts.iface, opts.device,
+				    (void **) &((*mr_bufs)[i]), mr_size);
+		if (ret)
+			return ret;
+
 		context->buf = (*mr_bufs)[i];
-    		if (((fi->domain_attr->mr_mode & FI_MR_LOCAL) ||
-		     (fi->caps & (FI_RMA | FI_ATOMIC)))) {
-			ret = fi_mr_reg(domain, context->buf,
-					mr_size, access, 0,
-					start_key + i, 0,
-					&context->mr, NULL);
-			if (ret)
-				return ret;
 
-			context->desc = fi_mr_desc(context->mr);
-		} else {
-			context->mr =  NULL;
-			context->desc = NULL;
-		}
+		ret = ft_reg_mr(context->buf, mr_size, access,
+				start_key + i, &context->mr,
+				&context->desc);
+		if (ret)
+			return ret;
 	}
 
 	return 0;
@@ -439,7 +480,7 @@ static int ft_alloc_msgs(void)
 			   MAX(rx_size, FT_MAX_CTRL_MSG) * opts.window_size;
 	}
 
-	if (opts.options & FT_OPT_ALIGN) {
+	if (opts.options & FT_OPT_ALIGN && !(opts.options & FT_OPT_USE_DEVICE)) {
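+		/* Page alignment applies to host memory only; device
+		 * buffers are allocated through ft_hmem_alloc() below. */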
 		alignment = sysconf(_SC_PAGESIZE);
 		if (alignment < 0)
 			return -errno;
@@ -452,13 +493,13 @@ static int ft_alloc_msgs(void)
 			return ret;
 		}
 	} else {
-		buf = malloc(buf_size);
-		if (!buf) {
-			perror("malloc");
-			return -FI_ENOMEM;
-		}
+		ret = ft_hmem_alloc(opts.iface, opts.device, (void **) &buf, buf_size);
+		if (ret)
+			return ret;
 	}
-	memset(buf, 0, buf_size);
+	ret = ft_hmem_memset(opts.iface, opts.device, (void *) buf, 0, buf_size);
+	if (ret)
+		return ret;
 	rx_buf = buf;
 
 	if (opts.options & FT_OPT_ALLOC_MULT_MR)
@@ -468,16 +509,12 @@ static int ft_alloc_msgs(void)
 
 	remote_cq_data = ft_init_cq_data(fi);
 
-	if (!ft_mr_alloc_func && !ft_check_opts(FT_OPT_SKIP_REG_MR) &&
-	    ((fi->domain_attr->mr_mode & FI_MR_LOCAL) ||
-	     (fi->caps & (FI_RMA | FI_ATOMIC)))) {
-		ret = fi_mr_reg(domain, buf, buf_size, ft_info_to_mr_access(fi),
-				0, FT_MR_KEY, 0, &mr, NULL);
-		if (ret) {
-			FT_PRINTERR("fi_mr_reg", ret);
+	mr = &no_mr;
+	if (!ft_mr_alloc_func && !ft_check_opts(FT_OPT_SKIP_REG_MR)) {
+		ret = ft_reg_mr(buf, buf_size, ft_info_to_mr_access(fi),
+				FT_MR_KEY, &mr, &mr_desc);
+		if (ret)
 			return ret;
-		}
-		mr_desc = ft_check_mr_local_flag(fi) ? fi_mr_desc(mr) : NULL;
 	} else {
 		if (ft_mr_alloc_func) {
 			assert(!ft_check_opts(FT_OPT_SKIP_REG_MR));
@@ -485,7 +522,6 @@ static int ft_alloc_msgs(void)
 			if (ret)
 				return ret;
 		}
-		mr = &no_mr;
 	}
 
 	ret = ft_alloc_ctx_array(&tx_ctx_arr, &tx_mr_bufs, tx_buf,
@@ -640,12 +676,20 @@ int ft_alloc_active_res(struct fi_info *fi)
 	return 0;
 }
 
-static void ft_init(void)
+static int ft_init(void)
 {
 	tx_seq = 0;
 	rx_seq = 0;
 	tx_cq_cntr = 0;
 	rx_cq_cntr = 0;
+
+	/* If using device memory for transfers, require OOB address
+	 * exchange because extra steps are involved when passing
+	 * device buffers into fi_av_insert. */
+	if (opts.options & FT_OPT_ENABLE_HMEM)
+		opts.options |= FT_OPT_OOB_ADDR_EXCH;
+
+	return ft_hmem_init(opts.iface);
 }
 
 int ft_init_oob(void)
@@ -734,6 +778,11 @@ int ft_getinfo(struct fi_info *hints, struct fi_info **info)
 	if (!hints->ep_attr->type)
 		hints->ep_attr->type = FI_EP_RDM;
 
+	if (opts.options & FT_OPT_ENABLE_HMEM) {
+		hints->caps |= FI_HMEM;
+		hints->domain_attr->mr_mode |= FI_MR_HMEM;
+	}
+
 	ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info);
 	if (ret) {
 		FT_PRINTERR("fi_getinfo", ret);
@@ -766,7 +815,10 @@ int ft_start_server(void)
 {
 	int ret;
 
-	ft_init();
+	ret = ft_init();
+	if (ret)
+		return ret;
+
 	ret = ft_init_oob();
 	if (ret)
 		return ret;
@@ -928,7 +980,10 @@ int ft_client_connect(void)
 {
 	int ret;
 
-	ft_init();
+	ret = ft_init();
+	if (ret)
+		return ret;
+
 	ret = ft_init_oob();
 	if (ret)
 		return ret;
@@ -960,7 +1015,10 @@ int ft_init_fabric(void)
 {
 	int ret;
 
-	ft_init();
+	ret = ft_init();
+	if (ret)
+		return ret;
+
 	ret = ft_init_oob();
 	if (ret)
 		return ret;
@@ -1447,14 +1505,16 @@ int ft_exchange_keys(struct fi_rma_iov *peer_iov)
 
 static void ft_cleanup_mr_array(struct ft_context *ctx_arr, char **mr_bufs)
 {
-	int i;
+	int i, ret;
 
 	if (!mr_bufs)
 		return;
 
 	for (i = 0; i < opts.window_size; i++) {
 		FT_CLOSE_FID(ctx_arr[i].mr);
-		free(mr_bufs[i]);
+		ret = ft_hmem_free(opts.iface, mr_bufs[i]);
+		if (ret)
+			FT_PRINTERR("ft_hmem_free", ret);
 	}
 }
 
@@ -1484,6 +1544,8 @@ static void ft_close_fids(void)
 
 void ft_free_res(void)
 {
+	int ret;
+
 	ft_cleanup_mr_array(tx_ctx_arr, tx_mr_bufs);
 	ft_cleanup_mr_array(rx_ctx_arr, rx_mr_bufs);
 
@@ -1495,7 +1557,9 @@ void ft_free_res(void)
 	ft_close_fids();
 
 	if (buf) {
-		free(buf);
+		ret = ft_hmem_free(opts.iface, buf);
+		if (ret)
+			FT_PRINTERR("ft_hmem_free", ret);
 		buf = rx_buf = tx_buf = NULL;
 		buf_size = rx_size = tx_size = tx_mr_size = rx_mr_size = 0;
 	}
@@ -1511,6 +1575,10 @@ void ft_free_res(void)
 		fi_freeinfo(hints);
 		hints = NULL;
 	}
+
+	ret = ft_hmem_cleanup(opts.iface);
+	if (ret)
+		FT_PRINTERR("ft_hmem_cleanup", ret);
 }
 
 static int dupaddr(void **dst_addr, size_t *dst_addrlen,
@@ -2554,7 +2622,6 @@ int ft_finalize_ep(struct fid_ep *ep)
 	int ret;
 	struct fi_context ctx;
 
-	strcpy(tx_buf + ft_tx_prefix_size(), "fin");
 	iov.iov_base = tx_buf;
 	iov.iov_len = 4 + ft_tx_prefix_size();
 
@@ -2706,6 +2773,7 @@ void ft_addr_usage()
 			"over the, optional, port");
 	FT_PRINT_OPTS_USAGE("-C <number>", "number of connections to accept before "
 			"cleaning up a server");
+	FT_PRINT_OPTS_USAGE("-F <addr_format>", "Address format (default: FI_FORMAT_UNSPEC)");
 }
 
 void ft_usage(char *name, char *desc)
@@ -2757,6 +2825,10 @@ void ft_mcusage(char *name, char *desc)
 	FT_PRINT_OPTS_USAGE("-p <provider>", "specific provider name eg sockets, verbs");
 	FT_PRINT_OPTS_USAGE("-d <domain>", "domain name");
 	FT_PRINT_OPTS_USAGE("-p <provider>", "specific provider name eg sockets, verbs");
+	FT_PRINT_OPTS_USAGE("-D <device_iface>", "Specify device interface: eg ze (default: None). "
+			     "Automatically enables FI_HMEM (-H)");
+	FT_PRINT_OPTS_USAGE("-i <device_id>", "Specify which device to use (default: 0)");
+	FT_PRINT_OPTS_USAGE("-H", "Enable provider FI_HMEM support");
 	FT_PRINT_OPTS_USAGE("-h", "display this help output");
 
 	return;
@@ -2823,6 +2895,19 @@ void ft_parseinfo(int op, char *optarg, struct fi_info *hints,
 		if (!strncasecmp("mr_local", optarg, 8))
 			opts->mr_mode &= ~FI_MR_LOCAL;
 		break;
+	case 'D':
+		if (!strncasecmp("ze", optarg, 2))
+			opts->iface = FI_HMEM_ZE;
+		else
+			printf("Unsupported interface\n");
+		opts->options |= FT_OPT_ENABLE_HMEM | FT_OPT_USE_DEVICE;
+		break;
+	case 'i':
+		opts->device = atoi(optarg);
+		break;
+	case 'H':
+		opts->options |= FT_OPT_ENABLE_HMEM;
+		break;
 	default:
 		/* let getopt handle unknown opts*/
 		break;
@@ -2852,6 +2937,16 @@ void ft_parse_addr_opts(int op, char *optarg, struct ft_opts *opts)
 		else
 			opts->oob_port = default_oob_port;
 		break;
+	case 'F':
+		if (!strncasecmp("fi_sockaddr_in", optarg, 14))
+			opts->address_format = FI_SOCKADDR_IN;
+		else if (!strncasecmp("fi_sockaddr_in6", optarg, 15))
+			opts->address_format = FI_SOCKADDR_IN6;
+		else if (!strncasecmp("fi_sockaddr_ib", optarg, 14))
+			opts->address_format = FI_SOCKADDR_IB;
+		else if (!strncasecmp("fi_sockaddr", optarg, 11)) /* keep me last */
+			opts->address_format = FI_SOCKADDR;
+		break;
 	case 'C':
 		opts->options |= FT_OPT_SERVER_PERSIST;
 		opts->num_connections = atoi(optarg);
@@ -2875,7 +2970,7 @@ void ft_parsecsopts(int op, char *optarg, struct ft_opts *opts)
 			opts->sizes_enabled = FT_ENABLE_ALL;
 		} else {
 			opts->options |= FT_OPT_SIZE;
-			opts->transfer_size = atoi(optarg);
+			opts->transfer_size = atol(optarg);
 		}
 		break;
 	case 'm':
@@ -2941,12 +3036,12 @@ int ft_parse_rma_opts(int op, char *optarg, struct fi_info *hints,
 	return 0;
 }
 
-void ft_fill_buf(void *buf, int size)
+void ft_fill_buf(void *buf, size_t size)
 {
 	char *msg_buf;
 	int msg_index;
 	static unsigned int iter = 0;
-	int i;
+	size_t i;
 
 	msg_index = ((iter++)*INTEG_SEED) % integ_alphabet_length;
 	msg_buf = (char *)buf;
@@ -2957,13 +3052,13 @@ void ft_fill_buf(void *buf, int size)
 	}
 }
 
-int ft_check_buf(void *buf, int size)
+int ft_check_buf(void *buf, size_t size)
 {
 	char *recv_data;
 	char c;
 	static unsigned int iter = 0;
 	int msg_index;
-	int i;
+	size_t i;
 
 	msg_index = ((iter++)*INTEG_SEED) % integ_alphabet_length;
 	recv_data = (char *)buf;
@@ -2976,7 +3071,7 @@ int ft_check_buf(void *buf, int size)
 			break;
 	}
 	if (i != size) {
-		printf("Error at iteration=%d size=%d byte=%d\n",
+		printf("Error at iteration=%u size=%zu byte=%zu\n",
 			iter, size, i);
 		return 1;
 	}
diff --git a/deps/libfabric/fabtests/configure.ac b/deps/libfabric/fabtests/configure.ac
index 24f290274c8d1e91be8aa2dfda39759ed7cce304..eeea4baffe9086955d5dce08b2f2b3a9560a27b0 100644
--- a/deps/libfabric/fabtests/configure.ac
+++ b/deps/libfabric/fabtests/configure.ac
@@ -5,7 +5,7 @@ dnl
 dnl Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.57)
-AC_INIT([fabtests], [1.10.1], [ofiwg@lists.openfabrics.org])
+AC_INIT([fabtests], [1.11.1], [ofiwg@lists.openfabrics.org])
 AC_CONFIG_AUX_DIR(config)
 AC_CONFIG_MACRO_DIR(config)
 AC_CONFIG_HEADERS(config.h)
@@ -96,6 +96,36 @@ AC_ARG_WITH([libfabric],
              LDFLAGS="-L$withval/$fab_libdir $LDFLAGS"],
             [])
 
+dnl Check for CUDA support. Require fabtests to dlopen CUDA runtime.
+AC_ARG_WITH([cuda],
+            [AC_HELP_STRING([--with-cuda=DIR],
+                            [Provide path to where the CUDA development
+                             and runtime libraries are installed.])],
+            [AS_IF([test "$freebsd" == "0"],
+                   [AC_CHECK_LIB(dl, dlopen, [], [AC_MSG_ERROR([dlopen not found.])])],
+                   [])
+             CPPFLAGS="-I$withval/include $CPPFLAGS"
+             AC_CHECK_HEADER([cuda_runtime.h],
+                             [AC_DEFINE([HAVE_CUDA_RUNTIME_H], [1],
+                                        [Define to 1 if you have <cuda_runtime.h>])],
+                             [AC_MSG_ERROR([<cuda_runtime.h> not found])])],
+            [])
+
+dnl Check for ROCR support. Require fabtests to dlopen ROCR.
+AC_ARG_WITH([rocr],
+            [AC_HELP_STRING([--with-rocr=DIR],
+                            [Provide path to where the ROCR development
+                             and runtime libraries are installed.])],
+            [AS_IF([test "$freebsd" == "0"],
+                   [AC_CHECK_LIB(dl, dlopen, [], [AC_MSG_ERROR([dlopen not found.])])],
+                   [])
+             CPPFLAGS="-I$withval/include $CPPFLAGS"
+             AC_CHECK_HEADER([hsa/hsa.h],
+                             [AC_DEFINE([HAVE_ROCR_RUNTIME_H], [1],
+                                        [Define to 1 if you have <hsa/hsa.h>])],
+                             [AC_MSG_ERROR([<hsa/hsa.h> not found])])],
+            [])
+
 dnl Checks for libraries
 AC_CHECK_LIB([fabric], fi_getinfo, [],
     AC_MSG_ERROR([fi_getinfo() not found.  fabtests requires libfabric.]))
@@ -105,6 +135,20 @@ AC_HEADER_STDC
 AC_CHECK_HEADER([rdma/fabric.h], [],
     [AC_MSG_ERROR([<rdma/fabric.h> not found.  fabtests requires libfabric.])])
 
+AC_ARG_WITH([ze],
+            AC_HELP_STRING([--with-ze], [Use non-default ZE location - default NO]),
+            [CPPFLAGS="-I$withval/include $CPPFLAGS"
+             LDFLAGS="-L$withval/$lib $LDFLAGS"],
+            [])
+
+dnl Checks for ZE libraries
+AS_IF([test x"$with_ze" != x"no"],
+      [AC_CHECK_LIB([ze_loader], zeInit,
+       AC_CHECK_HEADER([level_zero/ze_api.h],
+			AC_DEFINE([HAVE_LIBZE], 1, [ZE support])),
+			[])]
+      [])
+
 AC_MSG_CHECKING([for fi_trywait support])
 AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <rdma/fi_eq.h>]],
 	       [[fi_trywait(NULL, NULL, 0);]])],
diff --git a/deps/libfabric/fabtests/fabtests.vcxproj b/deps/libfabric/fabtests/fabtests.vcxproj
index d6f06bf49623dbb679c8ebe71dd75cabd00ceeeb..ad3cd3e3e9bc085fa0f2eee1041977af811e0d53 100644
--- a/deps/libfabric/fabtests/fabtests.vcxproj
+++ b/deps/libfabric/fabtests/fabtests.vcxproj
@@ -108,6 +108,10 @@
     <ClCompile Include="benchmarks\rdm_tagged_pingpong.c" />
     <ClCompile Include="benchmarks\rma_bw.c" />
     <ClCompile Include="common\jsmn.c" />
+    <ClCompile Include="common\hmem.c" />
+    <ClCompile Include="common\hmem_cuda.c" />
+    <ClCompile Include="common\hmem_rocr.c" />
+    <ClCompile Include="common\hmem_ze.c" />
     <ClCompile Include="common\shared.c" />
     <ClCompile Include="common\windows\getopt.c" />
     <ClCompile Include="common\windows\osd.c" />
@@ -150,6 +154,7 @@
     <ClInclude Include="ubertest\fabtest.h" />
     <ClInclude Include="include\ft_osd.h" />
     <ClInclude Include="include\jsmn.h" />
+    <ClInclude Include="include\hmem.h" />
     <ClInclude Include="include\shared.h" />
     <ClInclude Include="include\unit_common.h" />
     <ClInclude Include="include\windows\getopt\getopt.h" />
diff --git a/deps/libfabric/fabtests/fabtests.vcxproj.filters b/deps/libfabric/fabtests/fabtests.vcxproj.filters
index 2370a8e239debbebcbb19fc05e0fcf1e4216a21e..571896ccee19f9a2ef12eb74fd13474be80af5ee 100644
--- a/deps/libfabric/fabtests/fabtests.vcxproj.filters
+++ b/deps/libfabric/fabtests/fabtests.vcxproj.filters
@@ -48,6 +48,18 @@
     <ClCompile Include="common\jsmn.c">
       <Filter>Source Files\common</Filter>
     </ClCompile>
+    <ClCompile Include="common\hmem.c">
+      <Filter>Source Files\common</Filter>
+    </ClCompile>
+    <ClCompile Include="common\hmem_cuda.c">
+      <Filter>Source Files\common</Filter>
+    </ClCompile>
+    <ClCompile Include="common\hmem_rocr.c">
+      <Filter>Source Files\common</Filter>
+    </ClCompile>
+    <ClCompile Include="common\hmem_ze.c">
+      <Filter>Source Files\common</Filter>
+    </ClCompile>
     <ClCompile Include="common\shared.c">
       <Filter>Source Files\common</Filter>
     </ClCompile>
@@ -185,6 +197,9 @@
     <ClInclude Include="include\jsmn.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="include\hmem.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="include\shared.h">
       <Filter>Header Files</Filter>
     </ClInclude>
diff --git a/deps/libfabric/fabtests/functional/recv_cancel.c b/deps/libfabric/fabtests/functional/recv_cancel.c
index e75569471ca3d459e332945301c8d611cbf70fd6..35bcd500b4fa80b63ce179357efedf811f2dc3a5 100644
--- a/deps/libfabric/fabtests/functional/recv_cancel.c
+++ b/deps/libfabric/fabtests/functional/recv_cancel.c
@@ -161,6 +161,28 @@ static int recv_cancel_host(void)
 	if (opts.verbose)
 		fprintf(stdout, "GOOD: Completed uncancelled recv\n");
 
+	/* Repost cancelled recv and get completion */
+	ft_tag = CANCEL_TAG;
+	ret = ft_post_rx(ep, opts.transfer_size, &cancel_recv_ctx);
+	if (ret)
+		return ret;
+
+	do {
+		ret = fi_cq_read(rxcq, &recv_completion, 1);
+		if (ret > 0) {
+			if (recv_completion.op_context != &cancel_recv_ctx) {
+				FT_PRINTERR("ERROR: op_context does not match",
+					    -FI_EOTHER);
+				return -FI_EOTHER;
+			}
+		} else if ((ret <= 0) && (ret != -FI_EAGAIN)) {
+			FT_PRINTERR("fi_cq_read", ret);
+		}
+	} while (ret == -FI_EAGAIN);
+
+	if (opts.verbose)
+		fprintf(stdout, "GOOD: Completed reposted cancelled recv\n");
+
 	fprintf(stdout, "GOOD: Completed Recv Cancel Test\n");
 
 	return 0;
diff --git a/deps/libfabric/fabtests/include/hmem.h b/deps/libfabric/fabtests/include/hmem.h
new file mode 100644
index 0000000000000000000000000000000000000000..c12fff7c03a246705a4f18a0dbac0589f7f72b3a
--- /dev/null
+++ b/deps/libfabric/fabtests/include/hmem.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2020 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under the BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _HMEM_H_
+#define _HMEM_H_
+#if HAVE_CONFIG_H
+	#include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+#include <rdma/fi_domain.h>
+#include <rdma/fi_errno.h>
+
+int ft_ze_init(void);
+int ft_ze_cleanup(void);
+int ft_ze_alloc(uint64_t device, void **buf, size_t size);
+int ft_ze_free(void *buf);
+int ft_ze_memset(uint64_t device, void *buf, int value, size_t size);
+int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size);
+
+static inline int ft_host_init()
+{
+	return FI_SUCCESS;
+}
+
+static inline int ft_host_cleanup()
+{
+	return FI_SUCCESS;
+}
+
+static inline int ft_host_alloc(uint64_t device, void **buffer, size_t size)
+{
+	*buffer = malloc(size);
+	return !*buffer ? -FI_ENOMEM : FI_SUCCESS;
+}
+
+static inline int ft_host_free(void *buf)
+{
+	free(buf);
+	return FI_SUCCESS;
+}
+
+static inline int ft_host_memset(uint64_t device, void *buf, int value,
+				 size_t size)
+{
+	memset(buf, value, size);
+	return FI_SUCCESS;
+}
+
+static inline int ft_host_memcpy(uint64_t device, void *dst, const void *src,
+				 size_t size)
+{
+	memcpy(dst, src, size);
+	return FI_SUCCESS;
+}
+
+int ft_cuda_init(void);
+int ft_cuda_cleanup(void);
+int ft_cuda_alloc(uint64_t device, void **buf, size_t size);
+int ft_cuda_free(void *buf);
+int ft_cuda_memset(uint64_t device, void *buf, int value, size_t size);
+int ft_cuda_copy_to_hmem(uint64_t device, void *dst, const void *src,
+			 size_t size);
+int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src,
+			   size_t size);
+
+int ft_rocr_init(void);
+int ft_rocr_cleanup(void);
+int ft_rocr_alloc(uint64_t device, void **buf, size_t size);
+int ft_rocr_free(void *buf);
+int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size);
+int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size);
+
+int ft_hmem_init(enum fi_hmem_iface iface);
+int ft_hmem_cleanup(enum fi_hmem_iface iface);
+int ft_hmem_alloc(enum fi_hmem_iface iface, uint64_t device, void **buf,
+		  size_t size);
+int ft_hmem_free(enum fi_hmem_iface iface, void *buf);
+int ft_hmem_memset(enum fi_hmem_iface iface, uint64_t device, void *buf,
+		   int value, size_t size);
+int ft_hmem_copy_to(enum fi_hmem_iface iface, uint64_t device, void *dst,
+		    const void *src, size_t size);
+int ft_hmem_copy_from(enum fi_hmem_iface iface, uint64_t device, void *dst,
+		      const void *src, size_t size);
+
+#endif /* _HMEM_H_ */
diff --git a/deps/libfabric/fabtests/include/osx/malloc.h b/deps/libfabric/fabtests/include/osx/malloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7a9369c904b548d97a1b38d1f8b412544e2d529
--- /dev/null
+++ b/deps/libfabric/fabtests/include/osx/malloc.h
@@ -0,0 +1,44 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FABTESTS_OSX_MALLOC_H_
+#define _FABTESTS_OSX_MALLOC_H_
+
+#define M_MMAP_THRESHOLD    -3
+
+static inline int mallopt(int param, int value)
+{
+	/* Not supported. */
+	return 0;
+}
+
+#endif /* _FABTESTS_OSX_MALLOC_H_ */
diff --git a/deps/libfabric/fabtests/include/shared.h b/deps/libfabric/fabtests/include/shared.h
index c84e1e24b9264024862dcaf1683317ce2e30cd39..709537924aa90a15505f1c2ad5b2c322dba7aace 100644
--- a/deps/libfabric/fabtests/include/shared.h
+++ b/deps/libfabric/fabtests/include/shared.h
@@ -111,6 +111,8 @@ enum {
 	FT_OPT_OOB_ADDR_EXCH	= 1 << 14,
 	FT_OPT_ALLOC_MULT_MR	= 1 << 15,
 	FT_OPT_SERVER_PERSIST	= 1 << 16,
+	FT_OPT_ENABLE_HMEM	= 1 << 17,
+	FT_OPT_USE_DEVICE	= 1 << 18,
 	FT_OPT_OOB_CTRL		= FT_OPT_OOB_SYNC | FT_OPT_OOB_ADDR_EXCH,
 };
 
@@ -163,10 +165,14 @@ struct ft_opts {
 	char *oob_port;
 	int argc;
 	int num_connections;
+	int address_format;
 
 	uint64_t mr_mode;
 	/* Fail if the selected provider does not support FI_MSG_PREFIX.  */
 	int force_prefix;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+
 	char **argv;
 };
 
@@ -219,8 +225,8 @@ void ft_usage(char *name, char *desc);
 void ft_mcusage(char *name, char *desc);
 void ft_csusage(char *name, char *desc);
 
-void ft_fill_buf(void *buf, int size);
-int ft_check_buf(void *buf, int size);
+void ft_fill_buf(void *buf, size_t size);
+int ft_check_buf(void *buf, size_t size);
 int ft_check_opts(uint64_t flags);
 uint64_t ft_init_cq_data(struct fi_info *info);
 int ft_sock_listen(char *node, char *service);
@@ -236,8 +242,8 @@ extern int ft_parent_proc;
 extern int ft_socket_pair[2];
 extern int sock;
 extern int listen_sock;
-#define ADDR_OPTS "B:P:s:a:b::E::C:"
-#define FAB_OPTS "f:d:p:"
+#define ADDR_OPTS "B:P:s:a:b::E::C:F:"
+#define FAB_OPTS "f:d:p:D:i:H"
 #define INFO_OPTS FAB_OPTS "e:M:"
 #define CS_OPTS ADDR_OPTS "I:S:mc:t:w:l"
 #define NO_CQ_DATA 0
@@ -258,7 +264,10 @@ extern char default_port[8];
 		.rma_op = FT_RMA_WRITE, \
 		.oob_port = NULL, \
 		.mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP, \
-		.argc = argc, .argv = argv \
+		.iface = FI_HMEM_SYSTEM, \
+		.device = 0, \
+		.argc = argc, .argv = argv, \
+		.address_format = FI_FORMAT_UNSPEC \
 	}
 
 #define FT_STR_LEN 32
diff --git a/deps/libfabric/fabtests/man/fabtests.7.md b/deps/libfabric/fabtests/man/fabtests.7.md
index 6d7f00fb7249ab3255915d4ff57efce1b0c67e02..f64b2353900fcba58236ab5e16b1705ab048cb4d 100644
--- a/deps/libfabric/fabtests/man/fabtests.7.md
+++ b/deps/libfabric/fabtests/man/fabtests.7.md
@@ -209,6 +209,9 @@ testing scope is limited.
 *fi_mr_test*
 : Tests memory registration.
 
+*fi_mr_cache_evict*
+: Tests provider MR cache eviction capabilities.
+
 *fi_resource_freeing*
 : Allocates and closes fabric resources to check for proper cleanup.
 
@@ -379,6 +382,9 @@ the list available for that test.
 *-s <address>*
 : Specifies the address of the local endpoint.
 
+*-F <address_format>*
+: Specifies the address format.
+
 *-b[=oob_port]*
 : Enables out-of-band (via sockets) address exchange and test
   synchronization.  A port for the out-of-band connection may be specified
diff --git a/deps/libfabric/fabtests/man/man7/fabtests.7 b/deps/libfabric/fabtests/man/man7/fabtests.7
index 59c296df5a79bb9ae252a089c6bf7db7e3d38048..25644a35e7c296f0b129b93ce64fe2b18b590534 100644
--- a/deps/libfabric/fabtests/man/man7/fabtests.7
+++ b/deps/libfabric/fabtests/man/man7/fabtests.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fabtests" "7" "2020\-03\-02" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fabtests" "7" "2020\-07\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -303,6 +303,11 @@ Tests memory registration.
 .RS
 .RE
 .TP
+.B \f[I]fi_mr_cache_evict\f[]
+Tests provider MR cache eviction capabilities.
+.RS
+.RE
+.TP
 .B \f[I]fi_resource_freeing\f[]
 Allocates and closes fabric resources to check for proper cleanup.
 .RS
@@ -545,6 +550,11 @@ Specifies the port number of the peer endpoint, overriding the default.
 .IP \[bu] 2
 : Specifies the address of the local endpoint.
 .TP
+.B \f[I]\-F <address_format>\f[]
+Specifies the address format.
+.RS
+.RE
+.TP
 .B \f[I]\-b[=oob_port]\f[]
 Enables out\-of\-band (via sockets) address exchange and test
 synchronization.
diff --git a/deps/libfabric/fabtests/scripts/runfabtests.sh b/deps/libfabric/fabtests/scripts/runfabtests.sh
index f468c9bc274f93c5dd22386d453e7cdb4d123419..6f74d53ab926e2cd07eb3260529efc5ae16161f8 100755
--- a/deps/libfabric/fabtests/scripts/runfabtests.sh
+++ b/deps/libfabric/fabtests/scripts/runfabtests.sh
@@ -74,12 +74,19 @@ declare -i pass_count=0
 declare -i fail_count=0
 declare -i total_failures=0
 
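+# Prefer python3, then fall back to python2 and an unversioned python.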
+python=$(which python3 2>/dev/null) || python=$(which python2 2>/dev/null) || python=$(which python 2>/dev/null)
+
+if [ $? -ne 0 ]; then
+	echo "Unable to find python dependency, exiting..."
+	exit 1
+fi
+
 if [[ "$(uname)" == "FreeBSD" ]]; then
-    declare -ri FI_ENODATA=$(python -c 'import errno; print(errno.ENOMSG)')
+    declare -ri FI_ENODATA=$($python -c 'import errno; print(errno.ENOMSG)')
 else
-    declare -ri FI_ENODATA=$(python -c 'import errno; print(errno.ENODATA)')
+    declare -ri FI_ENODATA=$($python -c 'import errno; print(errno.ENODATA)')
 fi
-declare -ri FI_ENOSYS=$(python -c 'import errno; print(errno.ENOSYS)')
+declare -ri FI_ENOSYS=$($python -c 'import errno; print(errno.ENOSYS)')
 
 neg_unit_tests=(
 	"fi_dgram g00n13s"
diff --git a/deps/libfabric/fabtests/test_configs/efa/efa.exclude b/deps/libfabric/fabtests/test_configs/efa/efa.exclude
index bf34411880443a2310941c4c5ee795667f7b150f..8ec42b4c28a320d01b0c1153eecc24df7dc69e3d 100644
--- a/deps/libfabric/fabtests/test_configs/efa/efa.exclude
+++ b/deps/libfabric/fabtests/test_configs/efa/efa.exclude
@@ -61,9 +61,8 @@ trigger
 #rdm_cntr_pingpong
 
 
-# These tests require ENA IPs for the OOB sync
+# This test requires ENA IPs for the OOB sync
 av_xfer
-multi_recv
 
 # Connection manager isn't supported
 cm_data
diff --git a/deps/libfabric/fabtests/ubertest/ofi_atomic.c b/deps/libfabric/fabtests/ubertest/ofi_atomic.c
index 76a6c79b7a0b1a31401e65b29b30dcb03060a4e4..1737a4981d4bd8a42eb0456cbc09c1b36b5676dc 100644
--- a/deps/libfabric/fabtests/ubertest/ofi_atomic.c
+++ b/deps/libfabric/fabtests/ubertest/ofi_atomic.c
@@ -294,7 +294,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_LXOR)
 OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(WRITE, NAME, OFI_OP_MIN) },
@@ -330,7 +330,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ)
 OFI_DEFINE_ALL_HANDLERS(READWRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(READWRITE, NAME, OFI_OP_MIN) },
@@ -360,7 +360,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GE)
 OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GT)
 OFI_DEFINE_INT_HANDLERS(CSWAP, FUNC, OFI_OP_MSWAP)
 
-void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, const void *cmp, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) },
diff --git a/deps/libfabric/fabtests/ubertest/ofi_atomic.h b/deps/libfabric/fabtests/ubertest/ofi_atomic.h
index 98966e238d77c05f5092986a70ddf077964869ac..aec830b728f791a18f7aa6c94724e9350dc2521a 100644
--- a/deps/libfabric/fabtests/ubertest/ofi_atomic.h
+++ b/deps/libfabric/fabtests/ubertest/ofi_atomic.h
@@ -44,19 +44,39 @@ typedef float complex ofi_complex_float;
 typedef double complex ofi_complex_double;
 typedef long double complex ofi_complex_long_double;
 
-#define OFI_WRITE_OP_LAST	FI_CSWAP
-#define OFI_READWRITE_OP_LAST	FI_CSWAP
+#define OFI_WRITE_OP_START	FI_MIN
+#define OFI_WRITE_OP_LAST	(FI_ATOMIC_WRITE + 1)
+#define OFI_WRITE_OP_CNT	(OFI_WRITE_OP_LAST - OFI_WRITE_OP_START)
+#define OFI_READWRITE_OP_START	FI_MIN
+#define OFI_READWRITE_OP_LAST	(FI_ATOMIC_WRITE + 1)
+#define OFI_READWRITE_OP_CNT	(OFI_READWRITE_OP_LAST - OFI_READWRITE_OP_START)
 #define OFI_SWAP_OP_START	FI_CSWAP
-#define OFI_SWAP_OP_LAST	(FI_MSWAP - FI_CSWAP + 1)
+#define OFI_SWAP_OP_LAST	(FI_MSWAP + 1)
+#define OFI_SWAP_OP_CNT		(OFI_SWAP_OP_LAST - OFI_SWAP_OP_START)
 
-extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST])
+#define ofi_atomic_iswrite_op(op) \
+	(op >= OFI_WRITE_OP_START && op < OFI_WRITE_OP_LAST && op != FI_ATOMIC_READ)
+#define ofi_atomic_isreadwrite_op(op) \
+	(op >= OFI_READWRITE_OP_START && op < OFI_READWRITE_OP_LAST)
+#define ofi_atomic_isswap_op(op) \
+	(op >= OFI_SWAP_OP_START && op < OFI_SWAP_OP_LAST)
+
+extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
 			(void *dst, const void *src, size_t cnt);
-extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
 			(void *dst, const void *src, void *res, size_t cnt);
-extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
 			(void *dst, const void *src, const void *cmp,
 			 void *res, size_t cnt);
 
+#define ofi_atomic_write_handler(op, datatype, dst, src, cnt) \
+	ofi_atomic_write_handlers[op][datatype](dst, src, cnt)
+#define ofi_atomic_readwrite_handler(op, datatype, dst, src, res, cnt) \
+	ofi_atomic_readwrite_handlers[op][datatype](dst, src, res, cnt)
+#define ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, cnt) \
+	ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, src, \
+								cmp, res, cnt)
+
 #define OFI_DEF_COMPLEX_OPS(type)				\
 static inline int ofi_complex_eq_## type			\
 	(ofi_complex_## type a, ofi_complex_## type b)		\
diff --git a/deps/libfabric/fabtests/ubertest/verify.c b/deps/libfabric/fabtests/ubertest/verify.c
index 08011b5c24245f14ec894326a26cfc9dc35fcdb8..b58fa927461ee876b7c823450295d6c6522b2781 100644
--- a/deps/libfabric/fabtests/ubertest/verify.c
+++ b/deps/libfabric/fabtests/ubertest/verify.c
@@ -132,13 +132,11 @@ static int verify_atomic(void)
 	}
 
 	if (is_compare_func(test_info.class_function)) {
-		ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][type](dst,
-			src, cmp, tmp, count);
+		ofi_atomic_swap_handler(op, type, dst, src, cmp, tmp, count);
 	} else if (is_fetch_func(test_info.class_function)) {
-		ofi_atomic_readwrite_handlers[op][type](dst,
-			src, tmp, count);
+		ofi_atomic_readwrite_handler(op, type, dst, src, tmp, count);
 	} else {
-		ofi_atomic_write_handlers[op][type](dst, src, count);
+		ofi_atomic_write_handler(op, type, dst, src, count);
 	}
 
 	SWITCH_TYPES(type, CHECK_LOCAL, dst, ft_mr_ctrl.buf, count, ret);
diff --git a/deps/libfabric/fabtests/unit/getinfo_test.c b/deps/libfabric/fabtests/unit/getinfo_test.c
index 869470ed84cad09fe5a9fe124572e05a3158d674..e02138c4310efd8b5344f47b4340ede21c90c461 100644
--- a/deps/libfabric/fabtests/unit/getinfo_test.c
+++ b/deps/libfabric/fabtests/unit/getinfo_test.c
@@ -148,6 +148,9 @@ static int validate_bit_combos(char *node, char *service, uint64_t flags,
 		}
 		if (ret)
 			fail++;
+
+		fi_freeinfo(*info);
+		*info = NULL;
 	}
 	ret = 0;
 	printf("(passed)(skipped) (%d)(%d)/%d combinations\n",
@@ -206,10 +209,11 @@ static int init_caps(struct fi_info *hints, uint64_t bits)
 }
 
 #define PRIMARY_TX_CAPS	(FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \
-			 FI_MULTICAST | FI_NAMED_RX_CTX | FI_HMEM)
+			 FI_MULTICAST | FI_NAMED_RX_CTX | FI_HMEM | \
+			 FI_COLLECTIVE)
 #define PRIMARY_RX_CAPS (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \
 			 FI_DIRECTED_RECV | FI_VARIABLE_MSG | \
-			 FI_HMEM)
+			 FI_HMEM | FI_COLLECTIVE)
 
 #define PRIMARY_CAPS (PRIMARY_TX_CAPS | PRIMARY_RX_CAPS)
 #define DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_SHARED_AV)
diff --git a/deps/libfabric/fabtests/unit/mr_cache_evict.c b/deps/libfabric/fabtests/unit/mr_cache_evict.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e72e0f75292b992dbf59990db2ddc7be80db331
--- /dev/null
+++ b/deps/libfabric/fabtests/unit/mr_cache_evict.c
@@ -0,0 +1,844 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <time.h>
+#include <limits.h>
+#include <stdio.h>
+#include <malloc.h>
+
+#include "unit_common.h"
+#include "shared.h"
+#include "hmem.h"
+
+/* Supported memory region types. */
+enum alloc_type {
+	MMAP,
+	BRK,
+	SBRK,
+	CUDA,
+	ROCR,
+};
+
+static void *reuse_addr = NULL;
+static char err_buf[512];
+static size_t mr_buf_size = 16384;
+
+/* Given a time value, determine the maximum acceptable cached time value. The
+ * assumption is that a cached registration should be at least
+ * CACHE_IMPROVEMENT_PERCENT percent faster than the original registration.
+ */
+#define CACHE_IMPROVEMENT_PERCENT 80
+#define CACHE_TIME_MAX_VALUE(time) ((time) / 100 * \
+				    (100 - CACHE_IMPROVEMENT_PERCENT))
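+
+/* Example: with an initial registration time of 1000 nsecs and an 80%
+ * improvement target, a cached registration must complete in at most
+ * 1000 / 100 * (100 - 80) = 200 nsecs.
+ */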
+
+#define PAGEMAP_ENTRY_SIZE 8
+#define PAGEMAP_PFN_PRESENT (1ULL << 63)
+#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)
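+
+/* Each 64-bit pagemap entry holds the PFN in bits 0-54, valid only when
+ * bit 63 (page present) is set.
+ */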
+
+/* Get the physical address backing a virtual address. The caller must be
+ * root to read /proc/self/pagemap.
+ */
+static int virt_to_phys(const void *va_addr, uint64_t *phy_addr)
+{
+	int fd;
+	int ret;
+	uint64_t entry;
+	ssize_t read_size;
+	off_t seek_ret;
+	off_t seek_offset;
+	int page_size;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd == -1) {
+		ret = -errno;
+		FT_UNIT_STRERR(err_buf, "open of /proc/self/pagemap failed",
+			       ret);
+		return ret;
+	}
+
+	page_size = sysconf(_SC_PAGESIZE);
+	if (page_size == -1) {
+		ret = -errno;
+		FT_UNIT_STRERR(err_buf, "sysconf(_SC_PAGESIZE) failed", ret);
+		goto out;
+	}
+
+	/* Map virtual address to offset in pagemap. */
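+	/* Each page has one PAGEMAP_ENTRY_SIZE-byte entry in pagemap. */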
+	seek_offset = (uintptr_t) va_addr / page_size * PAGEMAP_ENTRY_SIZE;
+
+	seek_ret = lseek(fd, seek_offset, SEEK_SET);
+	if (seek_ret == -1) {
+		ret = -errno;
+		FT_UNIT_STRERR(err_buf, "lseek failed", ret);
+		goto out;
+	}
+
+	read_size = read(fd, &entry, sizeof(entry));
+	if (read_size == -1) {
+		ret = -errno;
+		FT_UNIT_STRERR(err_buf, "read failed", ret);
+		goto out;
+	} else if (read_size != sizeof(entry)) {
+		ret = -ENOSPC;
+		FT_UNIT_STRERR(err_buf, "short read", ret);
+		goto out;
+	}
+
+	if (entry & PAGEMAP_PFN_PRESENT) {
+		ret = 0;
+		*phy_addr = (entry & PAGEMAP_PFN_MASK) * page_size;
+	} else {
+		ret = -EFAULT;
+		FT_UNIT_STRERR(err_buf, "Failed to find physical address", ret);
+	}
+
+out:
+	close(fd);
+
+	return ret;
+}
+
+/* Sbrk/brk allocations are only intended to support a single outstanding
+ * allocation at a time. Extra handling of the program break would be needed
+ * to make sbrk/brk allocations more flexible, including making them thread
+ * safe.
+ */
+static void sbrk_free(void *ptr)
+{
+	void *cur_brk = (void *) ((uint64_t) ptr + mr_buf_size);
+	void *rewind_brk = ptr;
+
+	FT_DEBUG("Resetting program break from %p to %p", cur_brk, rewind_brk);
+	cur_brk = sbrk(-(intptr_t) mr_buf_size);
+	if (cur_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return;
+	}
+
+	/* Verify the program break was reset to the expected location. */
+	cur_brk = sbrk(0);
+	if (cur_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return;
+	}
+
+	if (cur_brk != rewind_brk)
+		FT_UNIT_STRERR(err_buf, "Failed to reset program break",
+			       -ENOMEM);
+}
+
+static void *sbrk_alloc(void)
+{
+	void *prev_brk;
+	void *cur_brk;
+
+	prev_brk = sbrk((intptr_t) mr_buf_size);
+	if (prev_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return NULL;
+	}
+
+	/* Determine the new program break for debug output. If this operation
+	 * fails, the newly allocated memory is leaked.
+	 */
+	cur_brk = sbrk(0);
+	if (cur_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return NULL;
+	}
+
+	FT_DEBUG("Moved program break from %p to %p", prev_brk, cur_brk);
+
+	return prev_brk;
+}
+
+static void brk_free(void *ptr)
+{
+	void *cur_brk = (void *) ((uint64_t) ptr + mr_buf_size);
+	void *rewind_brk = ptr;
+	int ret;
+
+	FT_DEBUG("Resetting program break from %p to %p", cur_brk, rewind_brk);
+	ret = brk(rewind_brk);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "brk failed", -errno);
+		return;
+	}
+
+	/* Verify the program break was reset to the expected location. */
+	cur_brk = sbrk(0);
+	if (cur_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return;
+	}
+
+	if (cur_brk != rewind_brk)
+		FT_UNIT_STRERR(err_buf, "Failed to reset program break",
+			       -ENOMEM);
+}
+
+static void *brk_alloc(void)
+{
+	void *prev_brk;
+	void *cur_brk;
+	int ret;
+
+	/* Use sbrk to determine the current program break. This is needed to
+	 * determine the brk allocation size.
+	 */
+	prev_brk = sbrk(0);
+	if (prev_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return NULL;
+	}
+
+	cur_brk = (void *) ((intptr_t) prev_brk + mr_buf_size);
+	ret = brk(cur_brk);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "brk failed", -errno);
+		return NULL;
+	}
+
+	/* Determine the new program break for debug output. If this operation
+	 * fails, the newly allocated memory is leaked.
+	 */
+	cur_brk = sbrk(0);
+	if (cur_brk == (void *) -1) {
+		FT_UNIT_STRERR(err_buf, "sbrk failed", -errno);
+		return NULL;
+	}
+
+	FT_DEBUG("Moved program break from %p to %p", prev_brk, cur_brk);
+
+	return prev_brk;
+}
+
+/* Mmap allocations are only intended to support a single outstanding
+ * allocation at a time. Extra handling of the mmap reuse address would be
+ * needed to make mmap allocations more flexible, including making them
+ * thread safe.
+ */
+static void mmap_free(void *ptr)
+{
+	if (munmap(ptr, mr_buf_size))
+		FT_UNIT_STRERR(err_buf, "munmap failed", -errno);
+}
+
+static void *mmap_alloc(void)
+{
+	void *ptr;
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+	/* If a reuse address is defined, request MAP_FIXED to require the mmap
+	 * allocation to reuse this address.
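+	 * MAP_FIXED silently replaces any existing mapping in the requested
+	 * range, which is acceptable here since only one allocation is
+	 * outstanding at a time.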
+	 */
+	if (reuse_addr)
+		flags |= MAP_FIXED;
+
+	ptr = mmap(reuse_addr, mr_buf_size, PROT_READ | PROT_WRITE, flags, -1,
+		   0);
+	if (ptr == MAP_FAILED) {
+		FT_UNIT_STRERR(err_buf, "mmap failed", -errno);
+		return NULL;
+	}
+
+	/* Cache this virtual address to reuse for future allocations. */
+	reuse_addr = ptr;
+
+	return ptr;
+}
+
+static void rocr_free(void *ptr)
+{
+	ft_hmem_free(FI_HMEM_ROCR, ptr);
+}
+
+static void *rocr_malloc(void)
+{
+	int ret;
+	void *ptr;
+
+	ret = ft_hmem_alloc(FI_HMEM_ROCR, 0, &ptr, mr_buf_size);
+	if (ret)
+		return NULL;
+	return ptr;
+}
+
+static void cuda_free(void *ptr)
+{
+	ft_hmem_free(FI_HMEM_CUDA, ptr);
+}
+
+static void *cuda_malloc(void)
+{
+	int ret;
+	void *ptr;
+
+	ret = ft_hmem_alloc(FI_HMEM_CUDA, 0, &ptr, mr_buf_size);
+	if (ret)
+		return NULL;
+	return ptr;
+}
+
+/* Generic allocation/deallocation function. Only a single allocation of any
+ * type should be outstanding.
+ */
+static void mem_free(void *ptr, enum alloc_type type)
+{
+	switch (type) {
+	case SBRK:
+		sbrk_free(ptr);
+		break;
+	case MMAP:
+		mmap_free(ptr);
+		break;
+	case BRK:
+		brk_free(ptr);
+		break;
+	case CUDA:
+		cuda_free(ptr);
+		break;
+	case ROCR:
+		rocr_free(ptr);
+		break;
+	default:
+		return;
+	}
+
+	FT_DEBUG("Memory freed: va=%p", ptr);
+}
+
+static enum fi_hmem_iface alloc_type_to_iface(enum alloc_type type)
+{
+	switch (type) {
+	case CUDA:
+		return FI_HMEM_CUDA;
+	case ROCR:
+		return FI_HMEM_ROCR;
+	default:
+		return FI_HMEM_SYSTEM;
+	}
+}
+
+/* User defined global mr_buf_size controls allocation size. */
+static void *mem_alloc(enum alloc_type type)
+{
+	uint64_t phys_addr = 0;
+	void *ptr;
+	int ret;
+
+	switch (type) {
+	case SBRK:
+		ptr = sbrk_alloc();
+		break;
+	case MMAP:
+		ptr = mmap_alloc();
+		break;
+	case BRK:
+		ptr = brk_alloc();
+		break;
+	case CUDA:
+		ptr = cuda_malloc();
+		break;
+	case ROCR:
+		ptr = rocr_malloc();
+		break;
+	default:
+		return NULL;
+	}
+
+	if (ptr) {
+		if (geteuid() == 0 &&
+		    alloc_type_to_iface(type) == FI_HMEM_SYSTEM) {
+			/* Perform a write to the buffer to ensure the kernel
+			 * has faulted in a page for this allocation. This will
+			 * help prevent virt_to_phys() from returning an error
+			 * due to no PFN.
+			 */
+			*(uint8_t *) ptr = 0;
+			ret = virt_to_phys(ptr, &phys_addr);
+			if (ret)
+				FT_DEBUG("virt_to_phys() failed: %s",
+					 fi_strerror(-ret));
+		}
+
+		FT_DEBUG("Memory allocated: va=%p size=%lu phys_addr=0x%lx",
+			 ptr, mr_buf_size, phys_addr);
+	}
+
+	return ptr;
+}
+
+/* Register an MR for the given buffer and report the elapsed registration
+ * time in nanoseconds.
+ */
+static int mr_register(const void *buf, struct fid_mr **mr, int64_t *elapsed,
+		       enum fi_hmem_iface iface)
+{
+	int ret;
+	const struct iovec iov = {
+		.iov_base = (void *) buf,
+		.iov_len = mr_buf_size,
+	};
+	struct fi_mr_attr mr_attr = {
+		.mr_iov = &iov,
+		.iov_count = 1,
+		.access = ft_info_to_mr_access(fi),
+		.requested_key = FT_MR_KEY,
+		.iface = iface,
+	};
+
+	ft_start();
+	ret = fi_mr_regattr(domain, &mr_attr, 0, mr);
+	ft_stop();
+
+	if (ret != FI_SUCCESS) {
+		FT_UNIT_STRERR(err_buf, "fi_mr_regattr failed", ret);
+		return ret;
+	}
+
+	*elapsed = get_elapsed(&start, &end, NANO);
+
+	return 0;
+}
+
+/* Run a test verifying the eviction of MR cache entries. The test works as
+ * follows:
+ * 1. Prime CPU caches by registering a priming MR. This MR is not used for
+ *    cache measurements.
+ *
+ * 2. Allocate a buffer using mem_alloc() with either MMAP, BRK, or SBRK. The
+ *    mem_alloc() allocator is constructed to return the same virtual address
+ *    during buffer reallocation.
+ *
+ * 3. Measure MR registration time of the mem_alloc() buffer. Since this buffer
+ *    has not been previously registered, the elapsed time for this MR
+ *    registration should be long. This is referred to as the initial MR
+ *    registration time.
+ *
+ * 4. Measure MR registration time of the mem_alloc() buffer again. Since this
+ *    buffer has been previously registered, the elapsed time for this MR
+ *    registration should be significantly less than the initial MR registration
+ *    time. If the registration time is not significantly less, it is assumed
+ *    the provider does not support MR caching, and the test will exit. This
+ *    elapsed time is referred to as the cached MR registration time.
+ *
+ * 5. If the provider supports caching, the mem_alloc() buffer is freed and
+ *    reallocated. Measures are in place to have the reallocated mem_alloc()
+ *    buffer return the same virtual address. During this time, the provider's
+ *    MR cache should experience an eviction.
+ *
+ * 6. Measure MR registration time of the mem_alloc() buffer a third time. Since
+ *    the provider should have experienced an MR cache eviction, the elapsed
+ *    time for this MR registration should not be significantly less than the
+ *    initial MR registration time. If this time is significantly less, it is
+ *    assumed this MR registration incorrectly found a cached MR entry. This
+ *    elapsed time is referred to as the reallocated MR registration time.
+ */
+static int mr_cache_test(enum alloc_type type)
+{
+	void *prime_buf = NULL;
+	struct fid_mr *prime_mr = NULL;
+	void *buf = NULL;
+	struct fid_mr *mr = NULL;
+	int64_t mr_reg_time;
+	struct fid_mr *cached_mr = NULL;
+	int64_t cached_mr_reg_time;
+	struct fid_mr *realloc_mr = NULL;
+	int64_t realloc_mr_reg_time;
+	int ret;
+	void *prev_buf;
+	int testret = FAIL;
+	enum fi_hmem_iface iface = alloc_type_to_iface(type);
+
+	/* Reallocate the domain to reset the MR cache. */
+	if (!domain) {
+		ret = -EINVAL;
+		FT_UNIT_STRERR(err_buf, "no domain allocated", ret);
+		goto cleanup;
+	}
+
+	ret = fi_close(&domain->fid);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "Failed to close the domain", ret);
+		domain = NULL;
+		goto cleanup;
+	}
+
+	ret = fi_domain(fabric, fi, &domain, NULL);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "fi_domain failed", ret);
+		domain = NULL;
+		goto cleanup;
+	}
+
+	/* A priming MR registration is used to ensure the first timed MR
+	 * registration is not skewed by cold CPU caches.
+	 */
+	switch (iface) {
+	case FI_HMEM_CUDA:
+		prime_buf = cuda_malloc();
+		if (!prime_buf) {
+			ret = -ENOMEM;
+			FT_UNIT_STRERR(err_buf, "cuda_malloc failed", ret);
+			goto cleanup;
+		}
+		break;
+
+	case FI_HMEM_ROCR:
+		prime_buf = rocr_malloc();
+		if (!prime_buf) {
+			ret = -ENOMEM;
+			FT_UNIT_STRERR(err_buf, "rocr_malloc failed", ret);
+			goto cleanup;
+		}
+		break;
+
+	default:
+		prime_buf = malloc(mr_buf_size);
+		if (!prime_buf) {
+			ret = -ENOMEM;
+			FT_UNIT_STRERR(err_buf, "malloc failed", ret);
+			goto cleanup;
+		}
+		break;
+	}
+
+	ret = mr_register(prime_buf, &prime_mr, &mr_reg_time, iface);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "mr_register failed", ret);
+		goto cleanup;
+	}
+
+	/* Perform initial MR registration. The elapsed registration time is
+	 * recorded for later comparison.
+	 */
+	buf = mem_alloc(type);
+	if (!buf) {
+		ret = -ENOMEM;
+		FT_UNIT_STRERR(err_buf, "mem_alloc failed", ret);
+		goto cleanup;
+	}
+
+	ret = mr_register(buf, &mr, &mr_reg_time, iface);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "mr_register failed", ret);
+		goto cleanup;
+	}
+
+	FT_DEBUG("Initial MR registration time: %ld nsecs", mr_reg_time);
+
+	/* Register the same buffer again. This registration should hit the
+	 * MR cache.
+	 */
+	ret = mr_register(buf, &cached_mr, &cached_mr_reg_time, iface);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "mr_register failed", ret);
+		goto cleanup;
+	}
+
+	FT_DEBUG("Cached MR registration time: %ld nsecs", cached_mr_reg_time);
+
+	/* If the cached registration is not within the expected duration,
+	 * assume the provider does not support MR caching.
+	 */
+	if (cached_mr_reg_time > CACHE_TIME_MAX_VALUE(mr_reg_time)) {
+		ret = -FI_ENOSYS;
+		sprintf(err_buf, "Assuming MR cache not enabled by provider");
+		goto cleanup;
+	}
+
+	/* Free the buffer without freeing the MR. This should result in the MR
+	 * cache evicting/invalidating the MR entry. The buffer will then be
+	 * reallocated and re-registered. The newly registered MR should not
+	 * have been cached.
+	 */
+	prev_buf = buf;
+	mem_free(buf, type);
+
+	buf = mem_alloc(type);
+	if (!buf) {
+		ret = -ENOMEM;
+		FT_UNIT_STRERR(err_buf, "mem_alloc failed", ret);
+		goto cleanup;
+	}
+
+	/* We NEED the same pointer to be returned for this test to be valid. */
+	if (buf != prev_buf) {
+		ret = -EFAULT;
+		FT_UNIT_STRERR(err_buf,
+			       "Failed to reallocate same virtual address",
+			       ret);
+		goto cleanup;
+	}
+
+	/* Verify reallocated MR registration time is close to the initial MR
+	 * registration time and greater than the cached MR registration time.
+	 */
+	ret = mr_register(buf, &realloc_mr, &realloc_mr_reg_time, iface);
+	if (ret) {
+		FT_UNIT_STRERR(err_buf, "mr_register failed", ret);
+		goto cleanup;
+	}
+
+	FT_DEBUG("Reallocated MR registration time: %ld nsecs",
+		 realloc_mr_reg_time);
+
+	if (realloc_mr_reg_time <= CACHE_TIME_MAX_VALUE(mr_reg_time)) {
+		ret = -EEXIST;
+		FT_UNIT_STRERR(err_buf,
+			       "Reallocated MR registration time too low. "
+			       "Cached MR may have been incorrectly used.",
+			       ret);
+	} else {
+		testret = PASS;
+	}
+
+cleanup:
+	if (realloc_mr)
+		fi_close(&realloc_mr->fid);
+
+	if (cached_mr)
+		fi_close(&cached_mr->fid);
+
+	if (mr)
+		fi_close(&mr->fid);
+
+	if (buf)
+		mem_free(buf, type);
+
+	if (prime_mr)
+		fi_close(&prime_mr->fid);
+
+	if (prime_buf) {
+		switch (iface) {
+		case FI_HMEM_CUDA:
+			cuda_free(prime_buf);
+			break;
+
+		case FI_HMEM_ROCR:
+			rocr_free(prime_buf);
+			break;
+
+		default:
+			free(prime_buf);
+			break;
+		}
+	}
+
+	return TEST_RET_VAL(ret, testret);
+}
+
+/* Run tests using MMAP, BRK, and SBRK. */
+static int mr_cache_mmap_test(void)
+{
+	return mr_cache_test(MMAP);
+}
+
+static int mr_cache_brk_test(void)
+{
+	return mr_cache_test(BRK);
+}
+
+static int mr_cache_sbrk_test(void)
+{
+	return mr_cache_test(SBRK);
+}
+
+static int mr_cache_cuda_test(void)
+{
+	int ret;
+
+	if (!(opts.options & FT_OPT_ENABLE_HMEM)) {
+		sprintf(err_buf, "FI_HMEM support not requested");
+		return SKIPPED;
+	}
+
+	ret = ft_hmem_init(FI_HMEM_CUDA);
+	if (ret) {
+		sprintf(err_buf, "ft_hmem_init(FI_HMEM_CUDA) failed");
+		return TEST_RET_VAL(ret, FAIL);
+	}
+
+	ret = mr_cache_test(CUDA);
+
+	ft_hmem_cleanup(FI_HMEM_CUDA);
+
+	return ret;
+}
+
+static int mr_cache_rocr_test(void)
+{
+	int ret;
+
+	if (!(opts.options & FT_OPT_ENABLE_HMEM)) {
+		sprintf(err_buf, "FI_HMEM support not requested");
+		return SKIPPED;
+	}
+
+	ret = ft_hmem_init(FI_HMEM_ROCR);
+	if (ret) {
+		sprintf(err_buf, "ft_hmem_init(FI_HMEM_ROCR) failed");
+		return TEST_RET_VAL(ret, FAIL);
+	}
+
+	ret = mr_cache_test(ROCR);
+
+	ft_hmem_cleanup(FI_HMEM_ROCR);
+
+	return ret;
+}
+
+struct test_entry test_array[] = {
+	TEST_ENTRY(mr_cache_mmap_test, "MR cache eviction test using MMAP"),
+	TEST_ENTRY(mr_cache_brk_test, "MR cache eviction test using BRK"),
+	TEST_ENTRY(mr_cache_sbrk_test, "MR cache eviction test using SBRK"),
+	TEST_ENTRY(mr_cache_cuda_test, "MR cache eviction test using CUDA"),
+	TEST_ENTRY(mr_cache_rocr_test, "MR cache eviction test using ROCR"),
+	{ NULL, "" }
+};
+
+static void usage(void)
+{
+	ft_unit_usage("fi_mr_cache_evict",
+		"Test a provider's ability to evict MR cache entries.\n"
+		"Evictions are verified using MMAP, BRK, SBRK, CUDA and ROCR\n"
+		"allocations. FI_HMEM support must be enabled to run CUDA and\n"
+		"ROCR tests.\n\n"
+		"With debug enabled, when running as root, the physical\n"
+		"address of the first page of the MMAP, BRK, and SBRK\n"
+		"allocations is printed. This can be used to verify that the\n"
+		"underlying physical memory changes between MMAP, BRK, and\n"
+		"SBRK allocations. When running as non-root, the reported\n"
+		"physical address is always zero.");
+	FT_PRINT_OPTS_USAGE("-s <bytes>", "Memory region size to be tested.");
+	FT_PRINT_OPTS_USAGE("-H", "Enable provider FI_HMEM support");
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	int op;
+	int failed = 0;
+
+	/* Force malloc to use mmap by setting M_MMAP_THRESHOLD to 1. This
+	 * allows this application to control the program break. Note that not
+	 * all operating systems support this call, so failure of mallopt() is
+	 * not treated as an error; however, it could affect the results of
+	 * the test.
+	 */
+	ret = mallopt(M_MMAP_THRESHOLD, 1);
+	if (ret != 1)
+		FT_PRINTERR("Failed to set M_MMAP_THRESHOLD to 1. "
+			    "System may not support M_MMAP_THRESHOLD. "
+			    "Proceeding with test.", -EINVAL);
+
+	hints = fi_allocinfo();
+	if (!hints)
+		return EXIT_FAILURE;
+
+	while ((op = getopt(argc, argv, FAB_OPTS "h" "s:")) != -1) {
+		switch (op) {
+		default:
+			ft_parseinfo(op, optarg, hints, &opts);
+			break;
+		case 's':
+			errno = 0;
+			mr_buf_size = strtoul(optarg, NULL, 10);
+			if (mr_buf_size == 0)
+				ret = -EINVAL;
+			else if (mr_buf_size == ULONG_MAX && errno)
+				ret = -errno;
+			else
+				ret = 0;
+
+			if (ret) {
+				FT_PRINTERR("Invalid memory region size", ret);
+				goto out;
+			}
+			break;
+		case '?':
+		case 'h':
+			usage();
+			return EXIT_FAILURE;
+		}
+	}
+
+	hints->mode = ~0;
+	hints->domain_attr->mode = ~0;
+	hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE);
+	hints->caps |= FI_MSG | FI_RMA;
+
+	if (opts.options & FT_OPT_ENABLE_HMEM)
+		hints->caps |= FI_HMEM;
+
+	ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi);
+	if (ret) {
+		hints->caps &= ~FI_RMA;
+		ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi);
+		if (ret) {
+			FT_PRINTERR("fi_getinfo", ret);
+			goto out;
+		}
+	}
+
+	if (!ft_info_to_mr_access(fi))
+		goto out;
+
+	if (fi->domain_attr->mr_iov_limit == 0) {
+		ret = -EINVAL;
+		FT_PRINTERR("mr_iov_limit not set", ret);
+		goto out;
+	}
+
+	ret = ft_open_fabric_res();
+	if (ret)
+		goto out;
+
+	printf("Testing MR cache on fabric %s domain %s\n",
+	       fi->fabric_attr->name, fi->domain_attr->name);
+
+	failed = run_tests(test_array, err_buf);
+	if (failed > 0)
+		printf("Summary: %d tests failed\n", failed);
+	else
+		printf("Summary: all tests passed\n");
+
+out:
+	ft_free_res();
+	return ret ? ft_exit_code(ret) : (failed > 0) ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/deps/libfabric/fabtests/unit/mr_test.c b/deps/libfabric/fabtests/unit/mr_test.c
index a7eca891d3366e481bd616e8eb008514628b54d4..c1f94d804f47704f6f2d5896d53fbf23468cab45 100644
--- a/deps/libfabric/fabtests/unit/mr_test.c
+++ b/deps/libfabric/fabtests/unit/mr_test.c
@@ -142,7 +142,7 @@ static int mr_regattr()
 	int testret = FAIL;
 	struct fid_mr *mr;
 	struct iovec *iov;
-	struct fi_mr_attr attr;
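+	/* Zero-initialize so members not set below (such as the newer iface
+	 * and device fields) do not contain garbage.
+	 */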
+	struct fi_mr_attr attr = {0};
 	char *base;
 
 	attr.access = ft_info_to_mr_access(fi);
diff --git a/deps/libfabric/include/ofi.h b/deps/libfabric/include/ofi.h
index ece4208eab4c4007ebf6bf2ed73f8f6ecd57137d..ee9ee6476849d70225c59cd05ee609d1b4c54256 100644
--- a/deps/libfabric/include/ofi.h
+++ b/deps/libfabric/include/ofi.h
@@ -194,6 +194,7 @@ enum ofi_prov_type {
 struct fi_prov_context {
 	enum ofi_prov_type type;
 	int disable_logging;
+	int disable_layering;
 };
 
 struct fi_filter {
@@ -280,6 +281,8 @@ uint64_t ofi_tag_format(uint64_t max_tag);
 uint8_t ofi_msb(uint64_t num);
 uint8_t ofi_lsb(uint64_t num);
 
+extern size_t ofi_universe_size;
+
 int ofi_send_allowed(uint64_t caps);
 int ofi_recv_allowed(uint64_t caps);
 int ofi_rma_initiate_allowed(uint64_t caps);
@@ -364,6 +367,14 @@ static inline uint32_t ofi_xorshift_random(uint32_t val)
 	return val;
 }
 
+static inline uint32_t ofi_xorshift_random_r(uint32_t *seed)
+{
+	return *seed = ofi_xorshift_random(*seed);
+}
+
+uint32_t ofi_generate_seed(void);
+
+size_t ofi_vrb_speed(uint8_t speed, uint8_t width);
 
 #ifdef __cplusplus
 }
diff --git a/deps/libfabric/include/ofi_atom.h b/deps/libfabric/include/ofi_atom.h
index 8d1b6d9b6258249f3b0e2fa4cdbd7e092493009e..46d6eb98dec01ce71f5892307045de295546a5b9 100644
--- a/deps/libfabric/include/ofi_atom.h
+++ b/deps/libfabric/include/ofi_atom.h
@@ -38,6 +38,7 @@
 #include <assert.h>
 #include <pthread.h>
 #include <stdlib.h>
+#include <stdbool.h>
 
 #include <ofi_lock.h>
 #include <ofi_osd.h>
@@ -123,6 +124,41 @@ typedef atomic_long	ofi_atomic_int64_t;
 		ATOMIC_IS_INITIALIZED(atomic);								\
 		return (int##radix##_t)atomic_fetch_sub_explicit(&atomic->val, val,			\
 								 memory_order_acq_rel) - val;		\
+	}												\
+	/**												\
+	 *  Compare and swap, strong version								\
+	 *												\
+	 *  @return true if atomic matches expected and the change is done, false			\
+	 *   otherwise.											\
+	 */												\
+	static inline											\
+	bool ofi_atomic_cas_bool_strong##radix(ofi_atomic##radix##_t *atomic, 				\
+						      int##radix##_t expected, 				\
+						      int##radix##_t desired)				\
+	{												\
+		ATOMIC_IS_INITIALIZED(atomic);								\
+		return atomic_compare_exchange_strong_explicit(&atomic->val, &expected, desired,	\
+							       memory_order_acq_rel,			\
+							       memory_order_relaxed);			\
+	}												\
+	/**												\
+	 *  Compare and swap, weak version								\
+	 *												\
+	 *  @return true if atomic matches expected and the change is done, false			\
+	 *   otherwise.											\
+	 *   This is the weak version and may incorrectly report a failed match.			\
+	 *   As a result it is most useful in loops that wait until the check succeeds.			\
+	 */												\
+	 static inline											\
+	 bool ofi_atomic_cas_bool_weak##radix(ofi_atomic##radix##_t *atomic, 				\
+					      int##radix##_t expected, 					\
+					      int##radix##_t desired)					\
+	{												\
+		ATOMIC_IS_INITIALIZED(atomic);								\
+		return atomic_compare_exchange_weak_explicit(&atomic->val, 				\
+							     &expected, desired,			\
+							     memory_order_acq_rel,			\
+							     memory_order_relaxed);			\
 	}
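+
+/* Typical use of the weak CAS is a retry loop, e.g. (sketch):
+ *
+ *	do {
+ *		cur = ofi_atomic_get32(&val);
+ *	} while (!ofi_atomic_cas_bool_weak32(&val, cur, cur + 1));
+ */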
 
 #elif defined HAVE_BUILTIN_ATOMICS
@@ -184,8 +220,30 @@ typedef atomic_long	ofi_atomic_int64_t;
 	{												\
 		*(ofi_atomic_ptr(atomic)) = value;							\
 		ATOMIC_INIT(atomic);									\
+	}												\
+	static inline											\
+	bool ofi_atomic_cas_bool##radix(ofi_atomic##radix##_t *atomic, 					\
+					int##radix##_t expected,					\
+					int##radix##_t desired)						\
+	{												\
+		 ATOMIC_IS_INITIALIZED(atomic);								\
+		 return ofi_atomic_cas_bool(radix, ofi_atomic_ptr(atomic), expected, desired);		\
+	}												\
+	static inline											\
+	bool ofi_atomic_cas_bool_strong##radix(ofi_atomic##radix##_t *atomic, 				\
+					       int##radix##_t expected,					\
+					       int##radix##_t desired)					\
+	{												\
+		return ofi_atomic_cas_bool##radix(atomic, expected, desired);				\
+	}												\
+	static inline											\
+	bool ofi_atomic_cas_bool_weak##radix(ofi_atomic##radix##_t *atomic, 				\
+					     int##radix##_t expected,					\
+					     int##radix##_t desired)					\
+	{												\
+		return ofi_atomic_cas_bool##radix(atomic, expected, desired);				\
 	}
-	
+
 #else /* HAVE_ATOMICS */
 
 #define OFI_ATOMIC_DEFINE(radix)								\
@@ -261,7 +319,37 @@ typedef atomic_long	ofi_atomic_int64_t;
 		v = atomic->val;								\
 		fastlock_release(&atomic->lock);						\
 		return v;									\
+	}											\
+	static inline										\
+	bool ofi_atomic_cas_bool##radix(ofi_atomic##radix##_t *atomic,				\
+					int##radix##_t expected,				\
+					int##radix##_t desired)					\
+	{											\
+		bool ret = false;								\
+		ATOMIC_IS_INITIALIZED(atomic);							\
+		fastlock_acquire(&atomic->lock);						\
+		if (atomic->val == expected) {							\
+			atomic->val = desired;							\
+			ret = true;								\
+		}										\
+		fastlock_release(&atomic->lock);						\
+		return ret;									\
+	}											\
+	static inline										\
+	bool ofi_atomic_cas_bool_strong##radix(ofi_atomic##radix##_t *atomic,			\
+							 int##radix##_t expected,		\
+							 int##radix##_t desired)		\
+	{											\
+		return ofi_atomic_cas_bool##radix(atomic, expected, desired);			\
+	}											\
+	static inline										\
+	bool ofi_atomic_cas_bool_weak##radix(ofi_atomic##radix##_t *atomic,			\
+							 int##radix##_t expected,		\
+							 int##radix##_t desired)		\
+	{											\
+		return ofi_atomic_cas_bool##radix(atomic, expected, desired);			\
 	}
+
 #endif // HAVE_ATOMICS
 
 OFI_ATOMIC_DEFINE(32)
diff --git a/deps/libfabric/include/ofi_atomic.h b/deps/libfabric/include/ofi_atomic.h
index a146e75bc20c3f88918ba3b5eed0af1a9f5f89f5..fc956945fd3388c818f388714348cf318884f0ea 100644
--- a/deps/libfabric/include/ofi_atomic.h
+++ b/deps/libfabric/include/ofi_atomic.h
@@ -43,19 +43,42 @@ extern "C" {
 
 size_t ofi_datatype_size(enum fi_datatype datatype);
 
-#define OFI_WRITE_OP_LAST	FI_CSWAP
-#define OFI_READWRITE_OP_LAST	FI_CSWAP
+/* The START value is included, LAST is exclusive, which matches the public
+ * header file use of LAST.  CNT is the number of valid values.
+ */
+#define OFI_WRITE_OP_START	FI_MIN
+#define OFI_WRITE_OP_LAST	(FI_ATOMIC_WRITE + 1)
+#define OFI_WRITE_OP_CNT	(OFI_WRITE_OP_LAST - OFI_WRITE_OP_START)
+#define OFI_READWRITE_OP_START	FI_MIN
+#define OFI_READWRITE_OP_LAST	(FI_ATOMIC_WRITE + 1)
+#define OFI_READWRITE_OP_CNT	(OFI_READWRITE_OP_LAST - OFI_READWRITE_OP_START)
 #define OFI_SWAP_OP_START	FI_CSWAP
-#define OFI_SWAP_OP_LAST	(FI_MSWAP - FI_CSWAP + 1)
+#define OFI_SWAP_OP_LAST	(FI_MSWAP + 1)
+#define OFI_SWAP_OP_CNT		(OFI_SWAP_OP_LAST - OFI_SWAP_OP_START)
+
+#define ofi_atomic_iswrite_op(op) \
+	(op >= OFI_WRITE_OP_START && op < OFI_WRITE_OP_LAST && op != FI_ATOMIC_READ)
+#define ofi_atomic_isreadwrite_op(op) \
+	(op >= OFI_READWRITE_OP_START && op < OFI_READWRITE_OP_LAST)
+#define ofi_atomic_isswap_op(op) \
+	(op >= OFI_SWAP_OP_START && op < OFI_SWAP_OP_LAST)
 
-extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
 			(void *dst, const void *src, size_t cnt);
-extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
 			(void *dst, const void *src, void *res, size_t cnt);
-extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST])
+extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
 			(void *dst, const void *src, const void *cmp,
 			 void *res, size_t cnt);
 
+#define ofi_atomic_write_handler(op, datatype, dst, src, cnt) \
+	ofi_atomic_write_handlers[op][datatype](dst, src, cnt)
+#define ofi_atomic_readwrite_handler(op, datatype, dst, src, res, cnt) \
+	ofi_atomic_readwrite_handlers[op][datatype](dst, src, res, cnt)
+#define ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, cnt) \
+	ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, src, \
+								cmp, res, cnt)
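+
+/* The write and read-write tables are indexed by op directly, which works
+ * because OFI_WRITE_OP_START (FI_MIN) is 0; the swap table is offset by
+ * OFI_SWAP_OP_START.
+ */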
+
 int ofi_atomic_valid(const struct fi_provider *prov,
 		     enum fi_datatype datatype, enum fi_op op, uint64_t flags);
 
diff --git a/deps/libfabric/include/ofi_bitmask.h b/deps/libfabric/include/ofi_bitmask.h
index 2591b223ba825ed52545e8e0d4b9f48108ecc02c..067fa38994f305fc53d57a2a1cdbb7bb9a891541 100644
--- a/deps/libfabric/include/ofi_bitmask.h
+++ b/deps/libfabric/include/ofi_bitmask.h
@@ -90,7 +90,7 @@ static inline void ofi_bitmask_set_all(struct bitmask *mask)
 
 static inline size_t ofi_bitmask_get_lsbset(struct bitmask mask)
 {
-	int cur;
+	size_t cur;
 	uint8_t tmp;
 	size_t ret = 0;
 
diff --git a/deps/libfabric/include/ofi_cuda.h b/deps/libfabric/include/ofi_cuda.h
index cedaf930cf997591ca973d4ab0dd51eb0ce63a71..564116a40e219b067d28c6ac4d9f895c5095a9d5 100644
--- a/deps/libfabric/include/ofi_cuda.h
+++ b/deps/libfabric/include/ofi_cuda.h
@@ -36,7 +36,7 @@
 
 #ifndef _OFI_CUDA_H_
 #define _OFI_CUDA_H_
-#ifdef HAVE_LIBCUDA
+#if HAVE_LIBCUDA
 
 #include <cuda.h>
 #include <cuda_runtime.h>
diff --git a/deps/libfabric/include/ofi_hmem.h b/deps/libfabric/include/ofi_hmem.h
new file mode 100644
index 0000000000000000000000000000000000000000..789b261c376b6997ca8b39d46244c2e1fd81940a
--- /dev/null
+++ b/deps/libfabric/include/ofi_hmem.h
@@ -0,0 +1,154 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _OFI_HMEM_H_
+#define _OFI_HMEM_H_
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <rdma/fi_domain.h>
+#include <stdbool.h>
+
+#if HAVE_LIBCUDA
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+/* Libfabric supported CUDA operations. */
+cudaError_t ofi_cudaMemcpy(void* dst, const void* src, size_t count,
+			   enum cudaMemcpyKind kind);
+const char *ofi_cudaGetErrorName(cudaError_t error);
+const char *ofi_cudaGetErrorString(cudaError_t error);
+CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute,
+				   CUdeviceptr ptr);
+CUresult ofi_cuInit(unsigned int flags);
+
+#endif /* HAVE_LIBCUDA */
+
+#ifdef HAVE_ROCR
+
+#include <hsa/hsa_ext_amd.h>
+
+/* Libfabric supported ROCr operations. */
+
+hsa_status_t ofi_hsa_memory_copy(void *dst, const void *src, size_t size);
+hsa_status_t ofi_hsa_amd_pointer_info(void *ptr, hsa_amd_pointer_info_t *info,
+				      void *(*alloc)(size_t),
+				      uint32_t *num_agents_accessible,
+				      hsa_agent_t **accessible);
+hsa_status_t ofi_hsa_init(void);
+hsa_status_t ofi_hsa_shut_down(void);
+hsa_status_t ofi_hsa_status_string(hsa_status_t status,
+				   const char **status_string);
+const char *ofi_hsa_status_to_string(hsa_status_t status);
+
+hsa_status_t ofi_hsa_amd_dereg_dealloc_cb(void *ptr,
+					  hsa_amd_deallocation_callback_t cb);
+hsa_status_t ofi_hsa_amd_reg_dealloc_cb(void *ptr,
+					hsa_amd_deallocation_callback_t cb,
+					void *user_data);
+
+#endif /* HAVE_ROCR */
+
+int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size);
+int rocr_hmem_init(void);
+int rocr_hmem_cleanup(void);
+bool rocr_is_addr_valid(const void *addr);
+
+int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size);
+int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size);
+int cuda_hmem_init(void);
+int cuda_hmem_cleanup(void);
+bool cuda_is_addr_valid(const void *addr);
+
+int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size);
+int ze_hmem_init(void);
+int ze_hmem_cleanup(void);
+bool ze_is_addr_valid(const void *addr);
+int ze_hmem_get_handle(void *dev_buf, void **handle);
+int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr);
+int ze_hmem_close_handle(void *ipc_ptr);
+
+static inline int ofi_memcpy(uint64_t device, void *dest, const void *src,
+			     size_t size)
+{
+	memcpy(dest, src, size);
+	return FI_SUCCESS;
+}
+
+static inline int ofi_hmem_init_noop(void)
+{
+	return FI_SUCCESS;
+}
+
+static inline int ofi_hmem_cleanup_noop(void)
+{
+	return FI_SUCCESS;
+}
+
+static inline int ofi_hmem_no_get_handle(void *dev_buffer, void **handle)
+{
+	return -FI_ENOSYS;
+}
+
+static inline int ofi_hmem_no_open_handle(void **handle, uint64_t device, void **ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+static inline int ofi_hmem_no_close_handle(void *ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size,
+			       enum fi_hmem_iface hmem_iface, uint64_t device,
+			       const struct iovec *hmem_iov,
+			       size_t hmem_iov_count, uint64_t hmem_iov_offset);
+
+ssize_t ofi_copy_to_hmem_iov(enum fi_hmem_iface hmem_iface, uint64_t device,
+			     const struct iovec *hmem_iov,
+			     size_t hmem_iov_count, uint64_t hmem_iov_offset,
+			     void *src, size_t size);
+
+int ofi_hmem_get_handle(enum fi_hmem_iface iface, void *dev_buf, void **handle);
+int ofi_hmem_open_handle(enum fi_hmem_iface iface, void **handle,
+			 uint64_t device, void **ipc_ptr);
+int ofi_hmem_close_handle(enum fi_hmem_iface iface, void *ipc_ptr);
+
+void ofi_hmem_init(void);
+void ofi_hmem_cleanup(void);
+enum fi_hmem_iface ofi_get_hmem_iface(const void *addr);
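+
+/* Example (sketch): copy out of a buffer whose origin is unknown by first
+ * detecting the owning interface:
+ *
+ *	struct iovec iov = { .iov_base = buf, .iov_len = len };
+ *
+ *	ofi_copy_from_hmem_iov(host_buf, len, ofi_get_hmem_iface(buf), 0,
+ *			       &iov, 1, 0);
+ */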
+
+#endif /* _OFI_HMEM_H_ */
diff --git a/deps/libfabric/include/ofi_indexer.h b/deps/libfabric/include/ofi_indexer.h
index 2891a64aa78ed705dab8cdde2c938534fad4a4c6..250f95644f2475366944a5b798b8a1da7f4fdd31 100644
--- a/deps/libfabric/include/ofi_indexer.h
+++ b/deps/libfabric/include/ofi_indexer.h
@@ -38,6 +38,7 @@
 #include "config.h"
 
 #include <sys/types.h>
+#include <stdbool.h>
 
 /*
  * Indexer:
@@ -79,6 +80,7 @@ struct indexer
 
 int ofi_idx_insert(struct indexer *idx, void *item);
 void *ofi_idx_remove(struct indexer *idx, int index);
+void *ofi_idx_remove_ordered(struct indexer *idx, int index);
 void ofi_idx_replace(struct indexer *idx, int index, void *item);
 void ofi_idx_reset(struct indexer *idx);
 
@@ -97,6 +99,10 @@ static inline void *ofi_idx_lookup(struct indexer *idx, int index)
 	return ofi_idx_is_valid(idx, index) ? ofi_idx_at(idx, index) : NULL;
 }
 
+static inline bool ofi_idx_free_list_empty(struct indexer *idx)
+{
+	return (idx->free_list == 0);
+}
+
 /*
  * Index map:
  * The index map is similar in concept to the indexer.  It allows the user
diff --git a/deps/libfabric/include/ofi_lock.h b/deps/libfabric/include/ofi_lock.h
index afde786b0e0cc174854eeb314ced752600c0a572..91b8dfadf76bf6997e35945d9c0a2f9cb58281a9 100644
--- a/deps/libfabric/include/ofi_lock.h
+++ b/deps/libfabric/include/ofi_lock.h
@@ -161,6 +161,8 @@ static inline void ofi_fastlock_acquire_noop(fastlock_t *lock)
 	/* These no-op routines must be used only by single-threaded code. */
 	assert(!lock->in_use);
 	lock->in_use = 1;
+#else
+	(void)lock;
 #endif
 }
 static inline void ofi_fastlock_release_noop(fastlock_t *lock)
@@ -168,6 +170,8 @@ static inline void ofi_fastlock_release_noop(fastlock_t *lock)
 #if ENABLE_DEBUG
 	assert(lock->in_use);
 	lock->in_use = 0;
+#else
+	(void)lock;
 #endif
 }
 
diff --git a/deps/libfabric/include/ofi_mem.h b/deps/libfabric/include/ofi_mem.h
index 92f3123626c331f1664486173d389df2a0f813ec..b3bd951018d7e9cc49b5124462280fc34ded2134 100644
--- a/deps/libfabric/include/ofi_mem.h
+++ b/deps/libfabric/include/ofi_mem.h
@@ -326,6 +326,7 @@ struct ofi_bufpool_region {
 	size_t				index;
 	void 				*context;
 	struct ofi_bufpool 		*pool;
+	int				flags;
 #ifndef NDEBUG
 	size_t 				use_cnt;
 #endif
diff --git a/deps/libfabric/include/ofi_mr.h b/deps/libfabric/include/ofi_mr.h
index 0ff553745ca6cdf812b32be70458024bbc18b862..149b534611f2db17880db01688fb718e8ef2869e 100644
--- a/deps/libfabric/include/ofi_mr.h
+++ b/deps/libfabric/include/ofi_mr.h
@@ -1,6 +1,8 @@
 /*
  * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved.
- * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates.
+ *                         All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -46,9 +48,12 @@
 #include <ofi_lock.h>
 #include <ofi_list.h>
 #include <ofi_tree.h>
+#include <ofi_hmem.h>
 
 struct ofi_mr_info {
 	struct iovec iov;
+	enum fi_hmem_iface iface;
+	uint64_t device;
 };
 
 
@@ -96,36 +101,73 @@ static inline uint64_t ofi_mr_get_prov_mode(uint32_t version,
 }
 
 
+/* Single lock used by all memory monitors and MR caches. */
+extern pthread_mutex_t mm_lock;
+/* The read-write lock is an additional lock used to protect the dlist_entry
+ * list of ofi_mem_monitor. Because mm_lock must be released while walking the
+ * dlist in ofi_monitor_notify, a separate lock is needed for thread safety.
+ * It must be a read-write lock because ofi_monitor_notify may be entered
+ * recursively, and concurrent walks of the list must not block one another.
+ */
+extern pthread_rwlock_t mm_list_rwlock;
+
 /*
  * Memory notifier - Report memory mapping changes to address ranges
  */
 
 struct ofi_mr_cache;
 
+union ofi_mr_hmem_info {
+	uint64_t cuda_id;
+};
+
 struct ofi_mem_monitor {
-	pthread_mutex_t 		lock;
 	struct dlist_entry		list;
+	enum fi_hmem_iface		iface;
 
+	void (*init)(struct ofi_mem_monitor *monitor);
+	void (*cleanup)(struct ofi_mem_monitor *monitor);
+	int (*start)(struct ofi_mem_monitor *monitor);
+	void (*stop)(struct ofi_mem_monitor *monitor);
 	int (*subscribe)(struct ofi_mem_monitor *notifier,
-			 const void *addr, size_t len);
+			 const void *addr, size_t len,
+			 union ofi_mr_hmem_info *hmem_info);
 	void (*unsubscribe)(struct ofi_mem_monitor *notifier,
-			    const void *addr, size_t len);
+			    const void *addr, size_t len,
+			    union ofi_mr_hmem_info *hmem_info);
+
+	/* The valid operation queries whether the memory monitor's view of a
+	 * buffer is still valid. If it is not (e.g. the pages behind a given
+	 * virtual address have changed), the buffer needs to be
+	 * re-registered.
+	 */
+	bool (*valid)(struct ofi_mem_monitor *notifier, const void *addr,
+		      size_t len, union ofi_mr_hmem_info *hmem_info);
 };
 
-void ofi_monitor_init(void);
-void ofi_monitor_cleanup(void);
-int ofi_monitor_add_cache(struct ofi_mem_monitor *monitor,
+void ofi_monitor_init(struct ofi_mem_monitor *monitor);
+void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor);
+void ofi_monitors_init(void);
+void ofi_monitors_cleanup(void);
+int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
 			   struct ofi_mr_cache *cache);
-void ofi_monitor_del_cache(struct ofi_mr_cache *cache);
+void ofi_monitors_del_cache(struct ofi_mr_cache *cache);
 void ofi_monitor_notify(struct ofi_mem_monitor *monitor,
 			const void *addr, size_t len);
+void ofi_monitor_flush(struct ofi_mem_monitor *monitor);
 
 int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor,
-			  const void *addr, size_t len);
+			  const void *addr, size_t len,
+			  union ofi_mr_hmem_info *hmem_info);
 void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor,
-			     const void *addr, size_t len);
+			     const void *addr, size_t len,
+			     union ofi_mr_hmem_info *hmem_info);
 
 extern struct ofi_mem_monitor *default_monitor;
+extern struct ofi_mem_monitor *default_cuda_monitor;
+extern struct ofi_mem_monitor *default_rocr_monitor;
 
 /*
  * Userfault fd memory monitor
@@ -136,9 +178,6 @@ struct ofi_uffd {
 	int				fd;
 };
 
-int ofi_uffd_init(void);
-void ofi_uffd_cleanup(void);
-
 extern struct ofi_mem_monitor *uffd_monitor;
 
 /*
@@ -149,11 +188,11 @@ struct ofi_memhooks {
 	struct dlist_entry		intercept_list;
 };
 
-int ofi_memhooks_init(void);
-void ofi_memhooks_cleanup(void);
-
 extern struct ofi_mem_monitor *memhooks_monitor;
 
+extern struct ofi_mem_monitor *cuda_monitor;
+
+extern struct ofi_mem_monitor *rocr_monitor;
 
 /*
  * Used to store registered memory regions into a lookup map.  This
@@ -193,8 +232,13 @@ struct ofi_mr {
 	struct util_domain *domain;
 	uint64_t key;
 	uint64_t flags;
+	enum fi_hmem_iface iface;
+	uint64_t device;
 };
 
+void ofi_mr_update_attr(uint32_t user_version, uint64_t caps,
+			const struct fi_mr_attr *user_attr,
+			struct fi_mr_attr *cur_abi_attr);
 int ofi_mr_close(struct fid *fid);
 int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 		   uint64_t flags, struct fid_mr **mr_fid);
@@ -216,6 +260,8 @@ struct ofi_mr_cache_params {
 	size_t				max_cnt;
 	size_t				max_size;
 	char *				monitor;
+	int				cuda_monitor_enabled;
+	int				rocr_monitor_enabled;
 };
 
 extern struct ofi_mr_cache_params	cache_params;
@@ -223,9 +269,9 @@ extern struct ofi_mr_cache_params	cache_params;
 struct ofi_mr_entry {
 	struct ofi_mr_info		info;
 	void				*storage_context;
-	unsigned int			subscribed:1;
 	int				use_cnt;
 	struct dlist_entry		list_entry;
+	union ofi_mr_hmem_info		hmem_info;
 	uint8_t				data[];
 };
 
@@ -251,10 +297,12 @@ struct ofi_mr_storage {
 	void				(*destroy)(struct ofi_mr_storage *storage);
 };
 
+#define OFI_HMEM_MAX 4
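+/* One slot per fi_hmem_iface value: system, CUDA, ROCr, and ZE. */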
+
 struct ofi_mr_cache {
 	struct util_domain		*domain;
-	struct ofi_mem_monitor		*monitor;
-	struct dlist_entry		notify_entry;
+	struct ofi_mem_monitor		*monitors[OFI_HMEM_MAX];
+	struct dlist_entry		notify_entries[OFI_HMEM_MAX];
 	size_t				entry_data_size;
 
 	struct ofi_mr_storage		storage;
@@ -278,13 +326,15 @@ struct ofi_mr_cache {
 							 struct ofi_mr_entry *entry);
 };
 
-int ofi_mr_cache_init(struct util_domain *domain, struct ofi_mem_monitor *monitor,
+int ofi_mr_cache_init(struct util_domain *domain,
+		      struct ofi_mem_monitor **monitors,
 		      struct ofi_mr_cache *cache);
 void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache);
 
 void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t len);
 
-bool ofi_mr_cache_flush(struct ofi_mr_cache *cache);
+bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru);
+
 int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr,
 			struct ofi_mr_entry **entry);
 /**
diff --git a/deps/libfabric/include/ofi_net.h b/deps/libfabric/include/ofi_net.h
index 7a924df2ac5c63b58d905da0816dce0b647291dd..c37805cd765a08f6205944c42ae114191711c02b 100644
--- a/deps/libfabric/include/ofi_net.h
+++ b/deps/libfabric/include/ofi_net.h
@@ -127,11 +127,33 @@ int ofi_discard_socket(SOCKET sock, size_t len);
 
 #define OFI_ADDRSTRLEN (INET6_ADDRSTRLEN + 50)
 
+/* Values taken from librdmacm/rdma_cma.h. */
+#define OFI_IB_IP_PS_MASK   0xFFFFFFFFFFFF0000ULL
+#define OFI_IB_IP_PORT_MASK   0x000000000000FFFFULL
+
+struct ofi_sockaddr_ib {
+	unsigned short int  sib_family; /* AF_IB */
+	uint16_t            sib_pkey;
+	uint32_t            sib_flowinfo;
+	uint8_t             sib_addr[16];
+	uint64_t            sib_sid;
+	uint64_t            sib_sid_mask;
+	uint64_t            sib_scope_id;
+};
+
+enum ofi_rdma_port_space {
+	OFI_RDMA_PS_IPOIB = 0x0002,
+	OFI_RDMA_PS_IB    = 0x013F,
+	OFI_RDMA_PS_TCP   = 0x0106,
+	OFI_RDMA_PS_UDP   = 0x0111,
+};
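+
+/* For AF_IB, the low 16 bits of sib_sid carry the port number and the next
+ * 16 bits carry the port space; see ofi_addr_set_port() below.
+ */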
+
 union ofi_sock_ip {
-	struct sockaddr		sa;
-	struct sockaddr_in	sin;
-	struct sockaddr_in6	sin6;
-	uint8_t			align[32];
+	struct sockaddr			sa;
+	struct sockaddr_in		sin;
+	struct sockaddr_in6		sin6;
+	struct ofi_sockaddr_ib		sib;
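+	/* Sized to cover the largest member, struct ofi_sockaddr_ib
+	 * (48 bytes). */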
+	uint8_t				align[48];
 };
 
 struct ofi_addr_list_entry {
@@ -141,6 +163,7 @@ struct ofi_addr_list_entry {
 	size_t			speed;
 	char			net_name[OFI_ADDRSTRLEN];
 	char			ifa_name[OFI_ADDRSTRLEN];
+	uint64_t		comm_caps;
 };
 
 int ofi_addr_cmp(const struct fi_provider *prov, const struct sockaddr *sa1,
@@ -160,6 +183,7 @@ void ofi_free_list_of_addr(struct slist *addr_list);
 #define ofi_sin6_addr(addr) (((struct sockaddr_in6 *)(addr))->sin6_addr)
 #define ofi_sin6_port(addr) (((struct sockaddr_in6 *)(addr))->sin6_port)
 
+#define ofi_sib_addr(addr) (((struct ofi_sockaddr_ib *)(addr))->sib_addr)
 
 static inline size_t ofi_sizeofaddr(const struct sockaddr *addr)
 {
@@ -168,6 +192,8 @@ static inline size_t ofi_sizeofaddr(const struct sockaddr *addr)
 		return sizeof(struct sockaddr_in);
 	case AF_INET6:
 		return sizeof(struct sockaddr_in6);
+	case AF_IB:
+		return sizeof(struct ofi_sockaddr_ib);
 	default:
 		FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format\n");
 		return 0;
@@ -181,6 +207,8 @@ static inline size_t ofi_sizeofip(const struct sockaddr *addr)
 		return sizeof(struct in_addr);
 	case AF_INET6:
 		return sizeof(struct in6_addr);
+	case AF_IB:
+		return sizeof(ofi_sib_addr(addr));
 	default:
 		FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format\n");
 		return 0;
@@ -203,7 +231,7 @@ static inline int ofi_translate_addr_format(int family)
 
 uint16_t ofi_get_sa_family(const struct fi_info *info);
 
-static inline int ofi_ipv4_is_any_addr(struct sockaddr *sa)
+static inline int ofi_sin_is_any_addr(struct sockaddr *sa)
 {
 	struct in_addr ia_any = {
 		.s_addr = INADDR_ANY,
@@ -216,7 +244,7 @@ static inline int ofi_ipv4_is_any_addr(struct sockaddr *sa)
 
 }
 
-static inline int ofi_ipv6_is_any_addr(struct sockaddr *sa)
+static inline int ofi_sin6_is_any_addr(struct sockaddr *sa)
 {
 	struct in6_addr ia6_any = IN6ADDR_ANY_INIT;
 
@@ -226,6 +254,16 @@ static inline int ofi_ipv6_is_any_addr(struct sockaddr *sa)
 	return !memcmp(&ofi_sin6_addr(sa), &ia6_any, sizeof(ia6_any));
 }
 
+static inline int ofi_sib_is_any_addr(struct sockaddr *sa)
+{
+	struct in6_addr ia6_any = IN6ADDR_ANY_INIT;
+
+	if (!sa)
+		return 0;
+
+	return !memcmp(&ofi_sib_addr(sa), &ia6_any, sizeof(ia6_any));
+}
+
 static inline int ofi_is_any_addr(struct sockaddr *sa)
 {
 	if (!sa)
@@ -233,9 +271,11 @@ static inline int ofi_is_any_addr(struct sockaddr *sa)
 
 	switch(sa->sa_family) {
 	case AF_INET:
-		return ofi_ipv4_is_any_addr(sa);
+		return ofi_sin_is_any_addr(sa);
 	case AF_INET6:
-		return ofi_ipv6_is_any_addr(sa);
+		return ofi_sin6_is_any_addr(sa);
+	case AF_IB:
+		return ofi_sib_is_any_addr(sa);
 	default:
 		FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format!\n");
 		return 0;
@@ -252,6 +292,8 @@ static inline uint16_t ofi_addr_get_port(const struct sockaddr *addr)
 		return ntohs(ofi_sin_port((const struct sockaddr_in *) addr));
 	case AF_INET6:
 		return ntohs(ofi_sin6_port((const struct sockaddr_in6 *) addr));
+	case AF_IB:
+		return (uint16_t)ntohll(((const struct ofi_sockaddr_ib *)addr)->sib_sid);
 	default:
 		FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown address format\n");
 		assert(0);
@@ -261,6 +303,8 @@ static inline uint16_t ofi_addr_get_port(const struct sockaddr *addr)
 
 static inline void ofi_addr_set_port(struct sockaddr *addr, uint16_t port)
 {
+	struct ofi_sockaddr_ib *sib;
+
 	switch (ofi_sa_family(addr)) {
 	case AF_INET:
 		ofi_sin_port(addr) = htons(port);
@@ -268,6 +312,11 @@ static inline void ofi_addr_set_port(struct sockaddr *addr, uint16_t port)
 	case AF_INET6:
 		ofi_sin6_port(addr) = htons(port);
 		break;
+	case AF_IB:
+		sib = (struct ofi_sockaddr_ib *)addr;
+		sib->sib_sid = htonll(((uint64_t)OFI_RDMA_PS_IB << 16) + ntohs(port));
+		sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | OFI_IB_IP_PORT_MASK);
+		break;
 	default:
 		FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown address format\n");
 		assert(0);
@@ -281,6 +330,8 @@ static inline void * ofi_get_ipaddr(const struct sockaddr *addr)
 		return &ofi_sin_addr((const struct sockaddr_in *) addr);
 	case AF_INET6:
 		return &ofi_sin6_addr((const struct sockaddr_in6 *) addr);
+	case AF_IB:
+		return &ofi_sib_addr((const struct ofi_sockaddr_ib *) addr);
 	default:
 		return NULL;
 	}
@@ -299,6 +350,9 @@ static inline int ofi_equals_ipaddr(const struct sockaddr *addr1,
 	case AF_INET6:
 	        return !memcmp(&ofi_sin6_addr(addr1), &ofi_sin6_addr(addr2),
 				sizeof(ofi_sin6_addr(addr1)));
+	case AF_IB:
+	        return !memcmp(&ofi_sib_addr(addr1), &ofi_sib_addr(addr2),
+				sizeof(ofi_sib_addr(addr1)));
 	default:
 		return 0;
 	}
@@ -323,6 +377,7 @@ size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr,
  */
 const char *ofi_straddr(char *buf, size_t *len,
 			uint32_t addr_format, const void *addr);
+uint32_t ofi_addr_format(const char *str);
 
 /* Returns allocated address to caller.  Caller must free.  */
 int ofi_str_toaddr(const char *str, uint32_t *addr_format,
diff --git a/deps/libfabric/include/ofi_shm.h b/deps/libfabric/include/ofi_shm.h
index 546674878d67e3b189f804406108d03461880d11..db388488ae1b43baf8cb45c14b9eeb747ca8fffc 100644
--- a/deps/libfabric/include/ofi_shm.h
+++ b/deps/libfabric/include/ofi_shm.h
@@ -42,6 +42,7 @@
 #include <ofi_proto.h>
 #include <ofi_mem.h>
 #include <ofi_rbuf.h>
+#include <ofi_tree.h>
 
 #include <rdma/providers/fi_prov.h>
 
@@ -74,8 +75,13 @@ enum {
 	smr_src_iov,	/* reference iovec via CMA */
 	smr_src_mmap,	/* mmap-based fallback protocol */
 	smr_src_sar,	/* segmentation fallback protocol */
+	smr_src_ipc,	/* device IPC handle protocol */
 };
 
+/* 0-255 reserved for defined ops, with room for new ops;
+ * 256 and beyond reserved for ctrl ops
+ */
+#define SMR_OP_MAX (1 << 8)
+
 #define SMR_REMOTE_CQ_DATA	(1 << 0)
 #define SMR_RMA_REQ		(1 << 1)
 #define SMR_TX_COMPLETION	(1 << 2)
@@ -91,7 +97,7 @@ enum {
 
 /* 
  * Unique smr_op_hdr for smr message protocol:
- * 	addr - local fi_addr of peer sending msg (for shm lookup)
+ * 	id - local shm_id of peer sending msg (for shm lookup)
  * 	op - type of op (ex. ofi_op_msg, defined in ofi_proto.h)
  * 	op_src - msg src (ex. smr_src_inline, defined above)
  * 	op_flags - operation flags (ex. SMR_REMOTE_CQ_DATA, defined above)
@@ -100,7 +106,7 @@ enum {
  */
 struct smr_msg_hdr {
 	uint64_t		msg_id;
-	fi_addr_t		addr;
+	int64_t			id;
 	uint32_t		op;
 	uint16_t		op_src;
 	uint16_t		op_flags;
@@ -119,6 +125,15 @@ struct smr_msg_hdr {
 
 #define SMR_MSG_DATA_LEN	(SMR_CMD_SIZE - sizeof(struct smr_msg_hdr))
 #define SMR_COMP_DATA_LEN	(SMR_MSG_DATA_LEN / 2)
+
+#define IPC_HANDLE_SIZE		64
+struct smr_ipc_info {
+	uint64_t	iface;
+	union {
+		uint8_t	ipc_handle[IPC_HANDLE_SIZE];
+	};
+};
+
 union smr_cmd_data {
 	uint8_t			msg[SMR_MSG_DATA_LEN];
 	struct {
@@ -133,6 +148,7 @@ union smr_cmd_data {
 	struct {
 		uint64_t	sar;
 	};
+	struct smr_ipc_info	ipc_info;
 };
 
 struct smr_cmd_msg {
@@ -162,14 +178,17 @@ struct smr_cmd {
 #define SMR_COMP_INJECT_SIZE	(SMR_INJECT_SIZE / 2)
 #define SMR_SAR_SIZE		16384
 
+#define SMR_NAME_MAX		256
+
 struct smr_addr {
-	char		name[NAME_MAX];
-	fi_addr_t	addr;
+	char		name[SMR_NAME_MAX];
+	int64_t		id;
 };
 
 struct smr_peer_data {
 	struct smr_addr		addr;
-	uint64_t		sar_status;
+	uint32_t		sar_status;
+	uint32_t		name_sent;
 };
 
 extern struct dlist_entry ep_name_list;
@@ -178,21 +197,24 @@ extern pthread_mutex_t ep_list_lock;
 struct smr_region;
 
 struct smr_ep_name {
-	char name[NAME_MAX];
+	char name[SMR_NAME_MAX];
 	struct smr_region *region;
 	struct dlist_entry entry;
 };
 
 struct smr_peer {
 	struct smr_addr		peer;
+	fi_addr_t		fiaddr;
 	struct smr_region	*region;
 };
 
 #define SMR_MAX_PEERS	256
 
 struct smr_map {
-	fastlock_t	lock;
-	struct smr_peer	peers[SMR_MAX_PEERS];
+	fastlock_t		lock;
+	int64_t			cur_id;
+	struct ofi_rbmap	rbmap;
+	struct smr_peer		peers[SMR_MAX_PEERS];
 };
 
 struct smr_region {
@@ -309,18 +331,18 @@ int	smr_map_create(const struct fi_provider *prov, int peer_count,
 		       struct smr_map **map);
 int	smr_map_to_region(const struct fi_provider *prov,
 			  struct smr_peer *peer_buf);
-void	smr_map_to_endpoint(struct smr_region *region, int index);
-void	smr_unmap_from_endpoint(struct smr_region *region, int index);
+void	smr_map_to_endpoint(struct smr_region *region, int64_t id);
+void	smr_unmap_from_endpoint(struct smr_region *region, int64_t id);
 void	smr_exchange_all_peers(struct smr_region *region);
 int	smr_map_add(const struct fi_provider *prov,
-		    struct smr_map *map, const char *name, int id);
-void	smr_map_del(struct smr_map *map, int id);
+		    struct smr_map *map, const char *name, int64_t *id);
+void	smr_map_del(struct smr_map *map, int64_t id);
 void	smr_map_free(struct smr_map *map);
 
-struct smr_region *smr_map_get(struct smr_map *map, int id);
+struct smr_region *smr_map_get(struct smr_map *map, int64_t id);
 
 int	smr_create(const struct fi_provider *prov, struct smr_map *map,
-		   const struct smr_attr *attr, struct smr_region **smr);
+		   const struct smr_attr *attr, struct smr_region *volatile *smr);
 void	smr_free(struct smr_region *smr);
 
 #ifdef __cplusplus
diff --git a/deps/libfabric/include/ofi_signal.h b/deps/libfabric/include/ofi_signal.h
index 43210162377754485091e0f0933f91f61426cb15..fa1fabdc74a4e09c6b3cdcd72d95a467d3b275c3 100644
--- a/deps/libfabric/include/ofi_signal.h
+++ b/deps/libfabric/include/ofi_signal.h
@@ -39,13 +39,14 @@
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <stdbool.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 
 #include <ofi_file.h>
 #include <ofi_osd.h>
+#include <ofi_atom.h>
 #include <rdma/fi_errno.h>
-#include <ofi_lock.h>
 
 
 enum {
@@ -53,11 +54,20 @@ enum {
 	FI_WRITE_FD
 };
 
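+/*
+ * State machine for the lock-free fd signal:
+ *   set:   UNSET -> WRITE_PREPARE (CAS), then SET once the byte is written
+ *   reset: SET -> READ_PREPARE (CAS), then UNSET once the byte is consumed
+ * A failed socket read or write rolls the state back to its prior value.
+ */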
+enum ofi_signal_state {
+	OFI_SIGNAL_UNSET,
+	OFI_SIGNAL_WRITE_PREPARE,
+	OFI_SIGNAL_SET,
+	OFI_SIGNAL_READ_PREPARE,
+};
+
 struct fd_signal {
-	fastlock_t	lock;
-	int		rcnt;
-	int		wcnt;
+	ofi_atomic32_t	state;
 	int		fd[2];
+
+#if ENABLE_DEBUG
+	ofi_atomic32_t debug_cnt;
+#endif
 };
 
 static inline int fd_signal_init(struct fd_signal *signal)
@@ -72,10 +82,11 @@ static inline int fd_signal_init(struct fd_signal *signal)
 	if (ret)
 		goto err;
 
-	ret = fastlock_init(&signal->lock);
-	if (ret)
-		goto err;
+	ofi_atomic_initialize32(&signal->state, OFI_SIGNAL_UNSET);
 
+#if ENABLE_DEBUG
+	ofi_atomic_initialize32(&signal->debug_cnt, 0);
+#endif
 	return 0;
 
 err:
@@ -88,30 +99,78 @@ static inline void fd_signal_free(struct fd_signal *signal)
 {
 	ofi_close_socket(signal->fd[0]);
 	ofi_close_socket(signal->fd[1]);
-
-	fastlock_destroy(&signal->lock);
 }
 
 static inline void fd_signal_set(struct fd_signal *signal)
 {
 	char c = 0;
-	fastlock_acquire(&signal->lock);
-	if (signal->wcnt == signal->rcnt) {
-		if (ofi_write_socket(signal->fd[FI_WRITE_FD], &c, sizeof c) == sizeof c)
-			signal->wcnt++;
+	bool cas; /* cas result */
+	int write_rc;
+
+	cas = ofi_atomic_cas_bool_strong32(&signal->state,
+					   OFI_SIGNAL_UNSET,
+					   OFI_SIGNAL_WRITE_PREPARE);
+	if (cas) {
+		write_rc = ofi_write_socket(signal->fd[FI_WRITE_FD], &c,
+					    sizeof c);
+		if (write_rc == sizeof c) {
+#if ENABLE_DEBUG
+			assert(ofi_atomic_inc32(&signal->debug_cnt) == 1);
+#endif
+			ofi_atomic_set32(&signal->state, OFI_SIGNAL_SET);
+		} else {
+			/* XXX: Setting the signal failed; a polling thread
+			 * will not be woken up and the system might get
+			 * stuck.  This path is effectively untested, as
+			 * the single-byte socket write essentially never
+			 * fails.
+			 */
+			ofi_atomic_set32(&signal->state, OFI_SIGNAL_UNSET);
+		}
 	}
-	fastlock_release(&signal->lock);
 }
 
 static inline void fd_signal_reset(struct fd_signal *signal)
 {
 	char c;
-	fastlock_acquire(&signal->lock);
-	if (signal->rcnt != signal->wcnt) {
-		if (ofi_read_socket(signal->fd[FI_READ_FD], &c, sizeof c) == sizeof c)
-			signal->rcnt++;
-	}
-	fastlock_release(&signal->lock);
+	bool cas; /* cas result */
+	enum ofi_signal_state state;
+	int read_rc;
+
+	do {
+		cas = ofi_atomic_cas_bool_weak32(&signal->state,
+						 OFI_SIGNAL_SET,
+						 OFI_SIGNAL_READ_PREPARE);
+		if (cas) {
+			read_rc = ofi_read_socket(signal->fd[FI_READ_FD], &c,
+						  sizeof c);
+			if (read_rc == sizeof c) {
+#if ENABLE_DEBUG
+				assert(ofi_atomic_dec32(&signal->debug_cnt) == 0);
+#endif
+				ofi_atomic_set32(&signal->state,
+						 OFI_SIGNAL_UNSET);
+				break;
+			} else {
+				ofi_atomic_set32(&signal->state, OFI_SIGNAL_SET);
+
+				/* Avoid spinning forever in this highly
+				 * unlikely code path.
+				 */
+				break;
+			}
+		}
+
+		state = ofi_atomic_get32(&signal->state);
+
+		/* Note that this loop also needs to include
+		 * OFI_SIGNAL_WRITE_PREPARE, as the writing thread writes
+		 * the signal byte to the socket while in _WRITE_PREPARE
+		 * state.  The reading thread may race with the writer and
+		 * reach this point before the state has switched to
+		 * OFI_SIGNAL_SET.
+		 */
+	} while (state == OFI_SIGNAL_WRITE_PREPARE || state == OFI_SIGNAL_SET);
 }
 
 static inline int fd_signal_poll(struct fd_signal *signal, int timeout)
diff --git a/deps/libfabric/include/ofi_tree.h b/deps/libfabric/include/ofi_tree.h
index 8793dbc773223d30a5d11e71be5faef239710773..a2efbf622ad9afa48129f73e5cba5ca4d8325089 100644
--- a/deps/libfabric/include/ofi_tree.h
+++ b/deps/libfabric/include/ofi_tree.h
@@ -82,6 +82,7 @@ void ofi_rbmap_init(struct ofi_rbmap *map,
 		int (*compare)(struct ofi_rbmap *map, void *key, void *data));
 void ofi_rbmap_cleanup(struct ofi_rbmap *map);
 
+struct ofi_rbnode *ofi_rbmap_get_root(struct ofi_rbmap *map);
 struct ofi_rbnode *ofi_rbmap_find(struct ofi_rbmap *map, void *key);
 struct ofi_rbnode *ofi_rbmap_search(struct ofi_rbmap *map, void *key,
 		int (*compare)(struct ofi_rbmap *map, void *key, void *data));
diff --git a/deps/libfabric/include/ofi_util.h b/deps/libfabric/include/ofi_util.h
index ac2f320ef322a01130b3f322e7053ee5a4d2b784..3929b0bc968ff5c4ccc666efe67a216b1977e709 100644
--- a/deps/libfabric/include/ofi_util.h
+++ b/deps/libfabric/include/ofi_util.h
@@ -452,10 +452,12 @@ int ofi_wait_fd_open(struct fid_fabric *fabric, struct fi_wait_attr *attr,
 int ofi_wait_add_fd(struct util_wait *wait, int fd, uint32_t events,
 		    ofi_wait_try_func wait_try, void *arg, void *context);
 int ofi_wait_del_fd(struct util_wait *wait, int fd);
+int ofi_wait_fdset_del(struct util_wait_fd *wait_fd, int fd);
 int ofi_wait_add_fid(struct util_wait *wat, fid_t fid, uint32_t events,
 		     ofi_wait_try_func wait_try);
 int ofi_wait_del_fid(struct util_wait *wait, fid_t fid);
 
+
 struct util_wait_yield {
 	struct util_wait	util_wait;
 	int			signal;
@@ -682,7 +684,13 @@ static inline void ofi_cntr_inc(struct util_cntr *cntr)
 struct util_av_entry {
 	ofi_atomic32_t	use_cnt;
 	UT_hash_handle	hh;
-	char		addr[0];
+	/*
+	 * 'data' holds 'addr' followed by any additional fields
+	 * associated with the av_entry. 'addr' must be the first
+	 * field in 'data', and its length must be a multiple of
+	 * 8 bytes so that the additional fields remain aligned.
+	 */
+	char		data[0];
 };
 
 struct util_av {
@@ -701,12 +709,23 @@ struct util_av {
 	uint64_t		flags;
 	size_t			count;
 	size_t			addrlen;
+	/*
+	 * context_offset is addrlen plus any padding required for
+	 * alignment; if addrlen is a multiple of 8 bytes, the padding is 0.
+	 */
+	size_t			context_offset;
 	struct dlist_entry	ep_list;
 	fastlock_t		ep_list_lock;
 };
 
 struct util_av_attr {
+	/* Must be a multiple of 8 bytes */
 	size_t	addrlen;
+	/*
+	 * Length of the additional fields appended to each
+	 * av_entry beyond struct util_av_entry and addr
+	 */
+	size_t	context_len;
 	int	flags;
 };
 
@@ -801,10 +820,10 @@ struct util_eq {
 
 struct util_event {
 	struct slist_entry	entry;
-	int			size;
+	ssize_t			size;
 	int			event;
 	int			err;
-	uint8_t			data[0];
+	uint8_t			data[0]; /* offset should be 8-byte aligned */
 };
 
 int ofi_eq_create(struct fid_fabric *fabric, struct fi_eq_attr *attr,
@@ -931,11 +950,14 @@ static inline int ofi_has_util_prefix(const char *str)
 }
 
 typedef int (*ofi_alter_info_t)(uint32_t version, const struct fi_info *src_info,
+				const struct fi_info *base_info,
 				struct fi_info *dest_info);
 
 int ofi_get_core_info(uint32_t version, const char *node, const char *service,
 		      uint64_t flags, const struct util_prov *util_prov,
-		      const struct fi_info *util_hints, ofi_alter_info_t info_to_core,
+		      const struct fi_info *util_hints,
+		      const struct fi_info *base_attr,
+		      ofi_alter_info_t info_to_core,
 		      struct fi_info **core_info);
 int ofix_getinfo(uint32_t version, const char *node, const char *service,
 		 uint64_t flags, const struct util_prov *util_prov,
@@ -992,6 +1014,27 @@ int ofi_ns_del_local_name(struct util_ns *ns, void *service, void *name);
 void *ofi_ns_resolve_name(struct util_ns *ns, const char *server,
 			  void *service);
 
+
+/* Set up coordination for credit-based flow control between the core
+ * and util providers.
+ * threshold - when the number of available RQ credits exceeds threshold,
+ *     the send handler will be invoked
+ * add_credits - increments the number of peer RQ credits available
+ * send_handler - called to have the util code send a credit message.  If
+ *     the credit message cannot be sent, the credits should be returned
+ *     to the core by calling add_credits.
+ */
+#define OFI_OPS_FLOW_CTRL "ofix_flow_ctrl_v1"
+
+struct ofi_ops_flow_ctrl {
+	size_t	size;
+	void	(*set_threshold)(struct fid_ep *ep, uint64_t threshold);
+	void	(*add_credits)(struct fid_ep *ep, uint64_t credits);
+	int	(*enable)(struct fid_ep *ep);
+	void	(*set_send_handler)(struct fid_domain *domain,
+			ssize_t (*send_handler)(struct fid_ep *ep, uint64_t credits));
+};
+
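+/* Illustrative sketch (assumed usage; core_domain, core_ep, and threshold
+ * are hypothetical names): a util provider could discover these ops on the
+ * core domain and drive the credit protocol through them.
+ *
+ *	struct ofi_ops_flow_ctrl *ops;
+ *
+ *	if (!fi_open_ops(&core_domain->fid, OFI_OPS_FLOW_CTRL, 0,
+ *			 (void **) &ops, NULL)) {
+ *		ops->set_threshold(core_ep, threshold);
+ *		ops->enable(core_ep);
+ *	}
+ */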
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/deps/libfabric/include/osx/osd.h b/deps/libfabric/include/osx/osd.h
index 4280560deb048b6c53948cd6b83498446b4505d4..c0f54d133f6965d77be94bb2293422bff245b0f7 100644
--- a/deps/libfabric/include/osx/osd.h
+++ b/deps/libfabric/include/osx/osd.h
@@ -95,6 +95,26 @@ static inline int ofi_hugepage_enabled(void)
 	return 0;
 }
 
+static inline ssize_t ofi_process_vm_readv(pid_t pid,
+			const struct iovec *local_iov,
+			unsigned long liovcnt,
+			const struct iovec *remote_iov,
+			unsigned long riovcnt,
+			unsigned long flags)
+{
+	return -FI_ENOSYS;
+}
+
+static inline ssize_t ofi_process_vm_writev(pid_t pid,
+			 const struct iovec *local_iov,
+			 unsigned long liovcnt,
+			 const struct iovec *remote_iov,
+			 unsigned long riovcnt,
+			 unsigned long flags)
+{
+	return -FI_ENOSYS;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/deps/libfabric/include/rdma/fabric.h b/deps/libfabric/include/rdma/fabric.h
index 37389cda67c908ee231a16d6aa0c31e2ce4296f3..bf1ccf0c4180fe4d19a267b9505f6ac7ec025c4f 100644
--- a/deps/libfabric/include/rdma/fabric.h
+++ b/deps/libfabric/include/rdma/fabric.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -38,6 +39,7 @@
 #include <stddef.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <rdma/fi_errno.h>
 
 #ifdef __GNUC__
 #define FI_DEPRECATED_FUNC __attribute__((deprecated))
@@ -77,7 +79,7 @@ extern "C" {
 #endif
 
 #define FI_MAJOR_VERSION 1
-#define FI_MINOR_VERSION 10
+#define FI_MINOR_VERSION 11
 #define FI_REVISION_VERSION 1
 
 enum {
@@ -152,6 +154,7 @@ typedef struct fid *fid_t;
 #define FI_PEEK			(1ULL << 19)
 #define FI_TRIGGER		(1ULL << 20)
 #define FI_FENCE		(1ULL << 21)
+#define FI_PRIORITY		(1ULL << 22)
 
 #define FI_COMPLETION		(1ULL << 24)
 #define FI_EVENT		FI_COMPLETION
@@ -530,6 +533,8 @@ struct fi_ops {
 	int	(*ops_open)(struct fid *fid, const char *name,
 			    uint64_t flags, void **ops, void *context);
 	int	(*tostr)(const struct fid *fid, char *buf, size_t len);
+	int	(*ops_set)(struct fid *fid, const char *name, uint64_t flags,
+			   void *ops, void *context);
 };
 
 /* All fabric interface descriptors must start with this structure */
@@ -649,6 +654,14 @@ fi_open_ops(struct fid *fid, const char *name, uint64_t flags,
 	return fid->ops->ops_open(fid, name, flags, ops, context);
 }
 
+static inline int
+fi_set_ops(struct fid *fid, const char *name, uint64_t flags,
+	   void *ops, void *context)
+{
+	return FI_CHECK_OP(fid->ops, struct fi_ops, ops_set) ?
+		fid->ops->ops_set(fid, name, flags, ops, context) : -FI_ENOSYS;
+}
+
 enum fi_type {
 	FI_TYPE_INFO,
 	FI_TYPE_EP_TYPE,
@@ -675,6 +688,7 @@ enum fi_type {
 	FI_TYPE_OP_TYPE,
 	FI_TYPE_FID,
 	FI_TYPE_COLLECTIVE_OP,
+	FI_TYPE_HMEM_IFACE,
 };
 
 char *fi_tostr(const void *data, enum fi_type datatype);
diff --git a/deps/libfabric/include/rdma/fi_domain.h b/deps/libfabric/include/rdma/fi_domain.h
index 4f3859f80b75cc198a3f10b4773a8bc5058dd86c..27d6dd398b28157383f6e0953f0f731cf29fa1ab 100644
--- a/deps/libfabric/include/rdma/fi_domain.h
+++ b/deps/libfabric/include/rdma/fi_domain.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -115,6 +116,8 @@ struct fid_mr {
 enum fi_hmem_iface {
 	FI_HMEM_SYSTEM	= 0,
 	FI_HMEM_CUDA,
+	FI_HMEM_ROCR,
+	FI_HMEM_ZE,
 };
 
 struct fi_mr_attr {
@@ -130,6 +133,7 @@ struct fi_mr_attr {
 	union {
 		uint64_t	reserved;
 		int		cuda;
+		int		ze;
 	} device;
 };
 
@@ -138,6 +142,23 @@ struct fi_mr_modify {
 	struct fi_mr_attr	attr;
 };
 
+#define FI_SET_OPS_HMEM_OVERRIDE "hmem_override_ops"
+
+struct fi_hmem_override_ops {
+	size_t	size;
+
+	ssize_t	(*copy_from_hmem_iov)(void *dest, size_t size,
+				      enum fi_hmem_iface iface, uint64_t device,
+				      const struct iovec *hmem_iov,
+				      size_t hmem_iov_count,
+				      uint64_t hmem_iov_offset);
+
+	ssize_t (*copy_to_hmem_iov)(enum fi_hmem_iface iface, uint64_t device,
+				    const struct iovec *hmem_iov,
+				    size_t hmem_iov_count,
+				    uint64_t hmem_iov_offset, const void *src,
+				    size_t size);
+};
 
 #ifdef FABRIC_DIRECT
 #include <rdma/fi_direct_atomic_def.h>
@@ -243,8 +264,9 @@ struct fi_ops_domain {
 	int	(*query_atomic)(struct fid_domain *domain,
 			enum fi_datatype datatype, enum fi_op op,
 			struct fi_atomic_attr *attr, uint64_t flags);
-	int (*query_collective)(struct fid_domain *domain, enum fi_collective_op coll,
-				struct fi_collective_attr *attr, uint64_t flags);
+	int	(*query_collective)(struct fid_domain *domain,
+			enum fi_collective_op coll,
+			struct fi_collective_attr *attr, uint64_t flags);
 };
 
 /* Memory registration flags */
diff --git a/deps/libfabric/include/rdma/providers/fi_log.h b/deps/libfabric/include/rdma/providers/fi_log.h
index a42d725e4eb01404fd810a882eff086860969b65..acf6f246e559394cc9955d0cdd29ed8a50c6c754 100644
--- a/deps/libfabric/include/rdma/providers/fi_log.h
+++ b/deps/libfabric/include/rdma/providers/fi_log.h
@@ -74,9 +74,12 @@ void fi_log(const struct fi_provider *prov, enum fi_log_level level,
 
 #define FI_LOG(prov, level, subsystem, ...)				\
 	do {								\
-		if (fi_log_enabled(prov, level, subsystem))		\
+		if (fi_log_enabled(prov, level, subsystem)) {		\
+			int saved_errno = errno;			\
 			fi_log(prov, level, subsystem,			\
 				__func__, __LINE__, __VA_ARGS__);	\
+			errno = saved_errno;				\
+		}							\
 	} while (0)
 
 #define FI_WARN(prov, subsystem, ...)					\
@@ -103,9 +106,11 @@ void fi_log(const struct fi_provider *prov, enum fi_log_level level,
 #define FI_WARN_ONCE(prov, subsystem, ...) ({				\
 	static int warned;						\
 	if (!warned && fi_log_enabled(prov, FI_LOG_WARN, subsystem)) {	\
+		int saved_errno = errno;				\
 		fi_log(prov, FI_LOG_WARN, subsystem,			\
 			__func__, __LINE__, __VA_ARGS__);		\
 		warned = 1;						\
+		errno = saved_errno;					\
 	}								\
 })
 
diff --git a/deps/libfabric/include/unix/osd.h b/deps/libfabric/include/unix/osd.h
index c502ac845160e967d70500123b1bce7dcb8924bb..f8b02b87614b4f5a92837875e9abc14e869adc0a 100644
--- a/deps/libfabric/include/unix/osd.h
+++ b/deps/libfabric/include/unix/osd.h
@@ -296,6 +296,8 @@ OFI_DEF_COMPLEX_OPS(long_double)
 #ifdef HAVE_BUILTIN_ATOMICS
 #define ofi_atomic_add_and_fetch(radix, ptr, val) __sync_add_and_fetch((ptr), (val))
 #define ofi_atomic_sub_and_fetch(radix, ptr, val) __sync_sub_and_fetch((ptr), (val))
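+/* Atomically replace *(ptr) with desired if it equals expected; returns true on success */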
+#define ofi_atomic_cas_bool(radix, ptr, expected, desired)	\
+	__sync_bool_compare_and_swap((ptr), (expected), (desired))
 #endif /* HAVE_BUILTIN_ATOMICS */
 
 int ofi_set_thread_affinity(const char *s);
diff --git a/deps/libfabric/include/windows/config.h b/deps/libfabric/include/windows/config.h
index 7e52a7b494297744536abe17432f5bfb2f740d87..912858778ef2f4b64f3964964676cfb85f41b11a 100644
--- a/deps/libfabric/include/windows/config.h
+++ b/deps/libfabric/include/windows/config.h
@@ -165,7 +165,7 @@
 #define PACKAGE_TARNAME PACKAGE
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.10.1"
+#define PACKAGE_VERSION "1.11.1"
 
 /* Define to the full name and version of this package. */
 #define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION
diff --git a/deps/libfabric/include/windows/osd.h b/deps/libfabric/include/windows/osd.h
index d0594040838a5f6a9f9c2b08fc0a2862da9f7cba..d3cabcebb659df82185af455138ddb9a921dae3e 100644
--- a/deps/libfabric/include/windows/osd.h
+++ b/deps/libfabric/include/windows/osd.h
@@ -261,6 +261,7 @@ do						\
 #define strcasecmp _stricmp
 #define snprintf _snprintf
 #define sleep(x) Sleep(x * 1000)
+#define strtok_r strtok_s
 
 #define __PRI64_PREFIX "ll"
 
@@ -709,40 +710,53 @@ static inline SOCKET ofi_socket(int domain, int type, int protocol)
 	return socket(domain, type, protocol);
 }
 
-static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count)
+/*
+ * The Windows API limits socket send/recv transfers to INT_MAX.
+ * For nonblocking, stream sockets, we limit send/recv calls to that
+ * size, since the sockets aren't guaranteed to send the full amount
+ * requested.  For datagram sockets, we don't expect any transfers to
+ * be larger than a few KB.
+ * We do not handle blocking sockets that attempt to transfer more
+ * than INT_MAX data at a time.
+ */
+static inline ssize_t
+ofi_recv_socket(SOCKET fd, void *buf, size_t count, int flags)
 {
-	return recv(fd, (char *)buf, (int)count, 0);
+	int len = count > INT_MAX ? INT_MAX : (int) count;
+	return (ssize_t) recv(fd, (char *) buf, len, flags);
 }
 
-static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count)
+static inline ssize_t
+ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags)
 {
-	return send(fd, (const char*)buf, (int)count, 0);
+	int len = count > INT_MAX ? INT_MAX : (int) count;
+	return (ssize_t) send(fd, (const char*) buf, len, flags);
 }
 
-static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count,
-				      int flags)
+static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count)
 {
-	return recv(fd, (char *)buf, (int)count, flags);
+	return ofi_recv_socket(fd, buf, count, 0);
 }
 
-static inline ssize_t
-ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags,
-		    struct sockaddr *from, socklen_t *fromlen)
+static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count)
 {
-	return recvfrom(fd, (char*)buf, (int)count, flags, from, fromlen);
+	return ofi_send_socket(fd, buf, count, 0);
 }
 
-static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count,
-				      int flags)
+static inline ssize_t
+ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags,
+		    struct sockaddr *from, socklen_t *fromlen)
 {
-	return send(fd, (const char*)buf, (int)count, flags);
+	int len = count > INT_MAX ? INT_MAX : (int) count;
+	return recvfrom(fd, (char*) buf, len, flags, from, (int *) fromlen);
 }
 
 static inline ssize_t
 ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags,
 		  const struct sockaddr *to, socklen_t tolen)
 {
-	return sendto(fd, (const char*)buf, (int)count, flags, to, tolen);
+	int len = count > INT_MAX ? INT_MAX : (int) count;
+	return sendto(fd, (const char*) buf, len, flags, to, (int) tolen);
 }
 
 ssize_t ofi_writev_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt);
@@ -987,11 +1001,15 @@ OFI_DEF_COMPLEX(long_double)
 /* atomics primitives */
 #ifdef HAVE_BUILTIN_ATOMICS
 #define InterlockedAdd32 InterlockedAdd
+#define InterlockedCompareExchange32 InterlockedCompareExchange
 typedef LONG ofi_atomic_int_32_t;
 typedef LONGLONG ofi_atomic_int_64_t;
 
 #define ofi_atomic_add_and_fetch(radix, ptr, val) InterlockedAdd##radix((ofi_atomic_int_##radix##_t *)(ptr), (ofi_atomic_int_##radix##_t)(val))
 #define ofi_atomic_sub_and_fetch(radix, ptr, val) InterlockedAdd##radix((ofi_atomic_int_##radix##_t *)(ptr), -(ofi_atomic_int_##radix##_t)(val))
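+/* Atomically replace *(ptr) with desired if it equals expected; nonzero on success */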
+#define ofi_atomic_cas_bool(radix, ptr, expected, desired)					\
+	(InterlockedCompareExchange##radix(ptr, desired, expected) == expected)
+
 #endif /* HAVE_BUILTIN_ATOMICS */
 
 static inline int ofi_set_thread_affinity(const char *s)
diff --git a/deps/libfabric/include/windows/pthread.h b/deps/libfabric/include/windows/pthread.h
index 66f2cd1666d101b84116fb1bf3582a341f45e8ff..67e3f9cef2b3ab4147655b1e97e74963a0aede69 100644
--- a/deps/libfabric/include/windows/pthread.h
+++ b/deps/libfabric/include/windows/pthread.h
@@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2017 Intel Corporation. All rights reserved.
+* Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
@@ -38,8 +39,10 @@
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdbool.h>
 
 #define PTHREAD_MUTEX_INITIALIZER {0}
+#define PTHREAD_RWLOCK_INITIALIZER {0}
 
 #define pthread_cond_signal WakeConditionVariable
 #define pthread_cond_broadcast WakeAllConditionVariable
@@ -154,6 +157,84 @@ typedef struct pthread_cleanup_t
 	pthread_cleanup_callback_t routine;
 	void *arg;
 } pthread_cleanup_t;
+
+/* Read-Write lock implementation */
+
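+/*
+ * write_mode records whether the lock is currently held exclusively so
+ * that pthread_rwlock_unlock knows which release routine to call.
+ */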
+typedef struct {
+	SRWLOCK	lock; /* Windows Slim Reader Writer Lock */
+	bool	write_mode;
+} pthread_rwlock_t;
+typedef void pthread_rwlockattr_t;
+
+static inline int pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
+{
+	(void)attr;
+	if (rwlock) {
+		InitializeSRWLock(&(rwlock->lock));
+		rwlock->write_mode = false;
+		return 0;
+	}
+	return 1;
+}
+
+static inline int pthread_rwlock_destroy(pthread_rwlock_t *rwlock)
+{
+	/* No SRWLock cleanup function */
+	(void)rwlock;
+	return 0;
+}
+
+static inline int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
+{
+	if (rwlock) {
+		AcquireSRWLockShared(&(rwlock->lock));
+		return 0;
+	}
+	return 1;
+}
+
+static inline int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
+{
+	if (rwlock && TryAcquireSRWLockShared(&(rwlock->lock))) {
+		return 0;
+	}
+	return 1;
+}
+
+static inline int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
+{
+	if (rwlock) {
+		AcquireSRWLockExclusive(&(rwlock->lock));
+		rwlock->write_mode = true;
+		return 0;
+	}
+	return 1;
+}
+
+static inline int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
+{
+	if (rwlock && TryAcquireSRWLockExclusive(&(rwlock->lock))) {
+		rwlock->write_mode = true;
+		return 0;
+	}
+	return 1;
+}
+
+static inline int pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
+{
+	if (rwlock) {
+		if (rwlock->write_mode) {
+			rwlock->write_mode = false;
+			ReleaseSRWLockExclusive(&(rwlock->lock));
+		} else {
+			ReleaseSRWLockShared(&(rwlock->lock));
+		}
+		return 0;
+	}
+	return 1;
+}
+
 #ifndef __cplusplus 
 #define pthread_cleanup_push(_rout, _arg)				\
 {									\
diff --git a/deps/libfabric/libfabric.spec.in b/deps/libfabric/libfabric.spec.in
index 71ca8fe55df59b3579fcc75d1ec7aa1ceb510098..f7680f651553d0b173065d04258bb30d641790e2 100644
--- a/deps/libfabric/libfabric.spec.in
+++ b/deps/libfabric/libfabric.spec.in
@@ -20,6 +20,9 @@ License: GPLv2 or BSD
 Url: http://www.github.com/ofiwg/libfabric
 Source: http://www.github.org/ofiwg/%{name}/releases/download/v{%version}/%{name}-%{version}.tar.bz2
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
+%if 0%{?suse_version} >= 1
+Provides: libfabric1 = %{version}-%{release}
+%endif
 
 %description
 libfabric provides a user-space API to access high-performance fabric
diff --git a/deps/libfabric/libfabric.vcxproj b/deps/libfabric/libfabric.vcxproj
index 495702e3dbcc26e3066be8d9068763264de587c2..ce41ee23d252bf1564e89155b9fc0703fbdde038 100644
--- a/deps/libfabric/libfabric.vcxproj
+++ b/deps/libfabric/libfabric.vcxproj
@@ -568,6 +568,8 @@
     <ClCompile Include="prov\util\src\util_mem_monitor.c" />
     <ClCompile Include="prov\util\src\util_mem_hooks.c" />
     <ClCompile Include="prov\util\src\util_mr_cache.c" />
+    <ClCompile Include="prov\util\src\cuda_mem_monitor.c" />
+    <ClCompile Include="prov\util\src\rocr_mem_monitor.c" />
     <ClCompile Include="src\common.c" />
     <ClCompile Include="src\enosys.c">
       <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug-ICC|x64'">4127;869</DisableSpecificWarnings>
@@ -576,6 +578,10 @@
     <ClCompile Include="src\fabric.c" />
     <ClCompile Include="src\fasthash.c" />
     <ClCompile Include="src\fi_tostr.c" />
+    <ClCompile Include="src\hmem.c" />
+    <ClCompile Include="src\hmem_cuda.c" />
+    <ClCompile Include="src\hmem_rocr.c" />
+    <ClCompile Include="src\hmem_ze.c" />
     <ClCompile Include="src\indexer.c" />
     <ClCompile Include="src\iov.c" />
     <ClCompile Include="src\shared\ofi_str.c" />
@@ -593,6 +599,7 @@
     <ClInclude Include="include\ofi_abi.h" />
     <ClInclude Include="include\ofi_atom.h" />
     <ClInclude Include="include\ofi_atomic.h" />
+    <ClInclude Include="include\ofi_hmem.h" />
     <ClInclude Include="include\ofi_hook.h" />
     <ClInclude Include="include\ofi_mr.h" />
     <ClInclude Include="include\ofi_net.h" />
diff --git a/deps/libfabric/libfabric.vcxproj.filters b/deps/libfabric/libfabric.vcxproj.filters
index b47b33f2ccd7daf529c25e268878331f79186a8d..55f0e0440a08bb854ddb86f6d09f9150a2e68048 100644
--- a/deps/libfabric/libfabric.vcxproj.filters
+++ b/deps/libfabric/libfabric.vcxproj.filters
@@ -138,6 +138,15 @@
     <ClCompile Include="src\mem.c">
       <Filter>Source Files\src</Filter>
     </ClCompile>
+    <ClCompile Include="src\hmem.c">
+      <Filter>Source Files\src</Filter>
+    </ClCompile>
+    <ClCompile Include="src\hmem_cuda.c">
+      <Filter>Source Files\src</Filter>
+    </ClCompile>
+    <ClCompile Include="src\hmem_rocr.c">
+      <Filter>Source Files\src</Filter>
+    </ClCompile>
     <ClCompile Include="src\rbtree.c">
       <Filter>Source Files\src</Filter>
     </ClCompile>
@@ -186,6 +195,12 @@
     <ClCompile Include="prov\util\src\util_mr_cache.c">
       <Filter>Source Files\prov\util</Filter>
     </ClCompile>
+    <ClCompile Include="prov\util\src\cuda_mem_monitor.c">
+      <Filter>Source Files\prov\util</Filter>
+    </ClCompile>
+    <ClCompile Include="prov\util\src\rocr_mem_monitor.c">
+      <Filter>Source Files\prov\util</Filter>
+    </ClCompile>
     <ClCompile Include="src\windows\osd.c">
       <Filter>Source Files\src\windows</Filter>
     </ClCompile>
@@ -503,6 +518,9 @@
     <ClInclude Include="include\ofi_enosys.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="include\ofi_hmem.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="include\ofi_indexer.h">
       <Filter>Header Files</Filter>
     </ClInclude>
diff --git a/deps/libfabric/man/fabric.7.md b/deps/libfabric/man/fabric.7.md
index 6a16c3717cf82172132b3537a30fb808aef09a7a..29f76f17ed8b746ac7078a85af709a7bc11af4d0 100644
--- a/deps/libfabric/man/fabric.7.md
+++ b/deps/libfabric/man/fabric.7.md
@@ -297,6 +297,68 @@ portability across providers.
   fabric domain may not be available in a child process because of copy
   on write restrictions.
 
+# ABI CHANGES
+
+libfabric releases maintain compatibility with older releases, so that
+compiled applications can continue to work as-is, and previously written
+applications will compile against newer versions of the library without
+needing source code changes.  The changes below describe ABI updates
+that have occurred and which libfabric release corresponds to the
+changes.
+
+Note that most API functions invoked by applications are static inline
+functions, which call through function pointers directly into the
+providers, so libfabric itself exports only a handful of functions.
+ABI changes are limited to those functions, most notably the fi_getinfo
+call and its returned attribute structures.
+
+The ABI version is independent of the libfabric release version.
+
+## ABI 1.0
+
+The initial libfabric release (1.0.0) also corresponds to ABI version 1.0.
+The 1.0 ABI was unchanged for libfabric major.minor versions 1.0, 1.1, 1.2,
+1.3, and 1.4.
+
+## ABI 1.1
+
+A number of external data structures were appended starting with libfabric
+version 1.5.  These changes included adding the fields to the following
+data structures.  The 1.1 ABI was exported by libfabric versions 1.5 and
+1.6.
+
+*fi_fabric_attr*
+: Added api_version
+
+*fi_domain_attr*
+: Added cntr_cnt, mr_iov_limit, caps, mode, auth_key, auth_key_size,
+  max_err_data, and mr_cnt fields.  The mr_mode field was also changed
+  from an enum to an integer flag field.
+
+*fi_ep_attr*
+: Added auth_key_size and auth_key fields.
+
+## ABI 1.2
+
+The 1.2 ABI version was exported by libfabric versions 1.7 and 1.8, and
+expanded the following structure.
+
+*fi_info*
+: The fi_info structure was expanded to reference a new fabric object,
+  fid_nic.  When available, the fid_nic references a new set of attributes
+  related to network hardware details.
+
+## ABI 1.3
+
+The 1.3 ABI is also the current ABI version.  All libfabric releases
+starting at 1.9 export this ABI.
+
+*fi_domain_attr*
+: Added tclass
+
+*fi_tx_attr*
+: Added tclass
+
 # SEE ALSO
 
 [`fi_info`(1)](fi_info.1.html),
diff --git a/deps/libfabric/man/fi_domain.3.md b/deps/libfabric/man/fi_domain.3.md
index 3703aa8ea1a227a3df911f869edd59903b3cdfc7..aa77ec011f50836cc11ac81aca51ecf06edea7c7 100644
--- a/deps/libfabric/man/fi_domain.3.md
+++ b/deps/libfabric/man/fi_domain.3.md
@@ -26,6 +26,9 @@ int fi_domain_bind(struct fid_domain *domain, struct fid *eq,
 
 int fi_open_ops(struct fid *domain, const char *name, uint64_t flags,
     void **ops, void *context);
+
+int fi_set_ops(struct fid *domain, const char *name, uint64_t flags,
+    void *ops, void *context);
 ```
 
 # ARGUMENTS
@@ -74,6 +77,74 @@ interfaces may be used to access low-level resources and operations
 that are specific to the opened resource domain.  The details of
 domain interfaces are outside the scope of this documentation.
 
+## fi_set_ops
+
+fi_set_ops assigns callbacks that a provider should invoke in place
+of performing selected tasks. This allows users to modify or control
+a provider's default behavior. Conceptually, it allows the user to
+hook specific functions used by a provider and replace them with
+their own.
+
+The operations being modified are identified using a well-known
+character string, passed as the name parameter. The format of the
+ops parameter is dependent upon the name value. The ops parameter will
+reference a structure containing the callbacks and other fields needed
+by the provider to invoke the user's functions.
+
+If a provider accepts the override, it will return FI_SUCCESS. If the
+override is unknown or not supported, the provider will return
+-FI_ENOSYS. Overrides should be set prior to allocating resources on
+the domain.
+
+The following fi_set_ops operations and corresponding callback
+structures are defined.
+
+**FI_SET_OPS_HMEM_OVERRIDE -- Heterogeneous Memory Overrides**
+
+HMEM override allows users to override HMEM related operations a
+provider may perform. Currently, the scope of the HMEM override
+is to allow a user to define the memory movement functions a provider
+should use when accessing a user buffer. The user-defined memory
+movement functions need to account for all the different HMEM iface
+types a provider may encounter.
+
+All objects allocated against a domain will inherit this override.
+
+The following is the HMEM override operation name and structure.
+
+```c
+#define FI_SET_OPS_HMEM_OVERRIDE "hmem_override_ops"
+
+struct fi_hmem_override_ops {
+    size_t	size;
+
+    ssize_t (*copy_from_hmem_iov)(void *dest, size_t size,
+        enum fi_hmem_iface iface, uint64_t device, const struct iovec *hmem_iov,
+        size_t hmem_iov_count, uint64_t hmem_iov_offset);
+
+    ssize_t (*copy_to_hmem_iov)(enum fi_hmem_iface iface, uint64_t device,
+	const struct iovec *hmem_iov, size_t hmem_iov_count,
+        uint64_t hmem_iov_offset, const void *src, size_t size);
+};
+```
+
+All fields in struct fi_hmem_override_ops must be set (non-null) to a
+valid value.
+
+*size*
+: This should be set to the sizeof(struct fi_hmem_override_ops). The
+size field is used for forward and backward compatibility purposes.
+
+*copy_from_hmem_iov*
+: Copy data from the device/hmem to host memory. This function should
+return a negative fi_errno on error, or the number of bytes copied on
+success.
+
+*copy_to_hmem_iov*
+: Copy data from host memory to the device/hmem. This function should
+return a negative fi_errno on error, or the number of bytes copied on
+success.
+
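+As an illustration only, an application might install the override on an
+open domain as follows.  The my_copy_from_hmem_iov and my_copy_to_hmem_iov
+routines are placeholders for user-defined implementations matching the
+signatures above.
+
+```c
+struct fi_hmem_override_ops hmem_ops = {
+    .size = sizeof(struct fi_hmem_override_ops),
+    .copy_from_hmem_iov = my_copy_from_hmem_iov,
+    .copy_to_hmem_iov = my_copy_to_hmem_iov,
+};
+
+int ret = fi_set_ops(&domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0,
+                     &hmem_ops, NULL);
+/* ret is -FI_ENOSYS if the provider does not support the override */
+```
+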
 ## fi_domain_bind
 
 Associates an event queue with the domain.  An event queue bound to a
@@ -546,7 +617,11 @@ fixed value of the maximum number of endpoints supported by the
 underlying hardware, or may be a dynamic value, based on the default
 attributes of an allocated endpoint, such as the endpoint capabilities
 and size.  The endpoint count is the number of addressable endpoints
-supported by the provider.
+supported by the provider. Providers return capability limits based on
+the configured hardware maximums. Providers cannot predict all possible
+system limitations; constraints discovered at runtime (e.g. application
+memory consumption, file descriptor usage) may further reduce these
+hardware maximums.
 
 ## Transmit Context Count (tx_ctx_cnt)
 
diff --git a/deps/libfabric/man/fi_endpoint.3.md b/deps/libfabric/man/fi_endpoint.3.md
index c19c9e0c9e91f74560e860dcfe3855ab8c27e75a..61794d920ceddf3db3134f28963e285c5009c6f5 100644
--- a/deps/libfabric/man/fi_endpoint.3.md
+++ b/deps/libfabric/man/fi_endpoint.3.md
@@ -479,7 +479,7 @@ The following option levels and option names and parameters are defined.
 : Defines the maximum size of a buffered message that will be reported
   to users as part of a receive completion when the FI_BUFFERED_RECV mode
   is enabled on an endpoint.
-  
+
   fi_getopt() will return the currently configured threshold, or the
   provider's default threshold if one has not been set by the application.
   fi_setopt() allows an application to configure the threshold.  If the
@@ -693,7 +693,7 @@ The protocol version allows providers to extend an existing protocol,
 by adding support for additional features or functionality for example,
 in a backward compatible manner.  Providers that support different versions
 of the same protocol should inter-operate, but only when using the
-capabilities defined for the lesser version. 
+capabilities defined for the lesser version.
 
 ## max_msg_size - Max Message Size
 
@@ -830,7 +830,7 @@ details.
 ## auth_key_size - Authorization Key Length
 
 The length of the authorization key in bytes.  This field will be 0 if
-authorization keys are not available or used.  This field is ignored 
+authorization keys are not available or used.  This field is ignored
 unless the fabric is opened with API version 1.5 or greater.
 
 ## auth_key - Authorization Key
@@ -841,9 +841,9 @@ to limit communication between endpoints.  Only peer endpoints that are
 programmed to use the same authorization key may communicate.
 Authorization keys are often used to implement job keys, to ensure
 that processes running in different jobs do not accidentally
-cross traffic.  The domain authorization key will be used if auth_key_size 
+cross traffic.  The domain authorization key will be used if auth_key_size
 is set to 0.  This field is ignored unless the fabric is opened with API
-version 1.5 or greater. 
+version 1.5 or greater.
 
 # TRANSMIT CONTEXT ATTRIBUTES
 
@@ -875,7 +875,8 @@ capability bits from the fi_info structure will be used.
 
 The following capabilities apply to the transmit attributes: FI_MSG,
 FI_RMA, FI_TAGGED, FI_ATOMIC, FI_READ, FI_WRITE, FI_SEND, FI_HMEM,
-FI_TRIGGER, FI_FENCE, FI_MULTICAST, FI_RMA_PMEM, and FI_NAMED_RX_CTX.
+FI_TRIGGER, FI_FENCE, FI_MULTICAST, FI_RMA_PMEM, FI_NAMED_RX_CTX,
+and FI_COLLECTIVE.
 
 Many applications will be able to ignore this field and rely solely
 on the fi_info::caps field.  Use of this field provides fine grained
@@ -1171,7 +1172,8 @@ capability bits from the fi_info structure will be used.
 The following capabilities apply to the receive attributes: FI_MSG,
 FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV,
 FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_VARIABLE_MSG,
-FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, and FI_SOURCE_ERR.
+FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, and
+FI_COLLECTIVE.
 
 Many applications will be able to ignore this field and rely solely
 on the fi_info::caps field.  Use of this field provides fine grained
@@ -1359,7 +1361,7 @@ associated with completion queues or counters.  Completed receive
 operations are posted to the CQs bound to the endpoint.  An endpoint
 may only be associated with a single receive context, and all
 connectionless endpoints associated with a shared receive context must
-also share the same address vector. 
+also share the same address vector.
 
 Endpoints associated with a shared transmit context may use dedicated
 receive contexts, and vice-versa.  Or an endpoint may use shared
diff --git a/deps/libfabric/man/fi_fabric.3.md b/deps/libfabric/man/fi_fabric.3.md
index e81f281a368f979e4da06ab5f44115c7010878c5..a69b0e288f2ff5579249d3f1b26418469fb051b7 100644
--- a/deps/libfabric/man/fi_fabric.3.md
+++ b/deps/libfabric/man/fi_fabric.3.md
@@ -144,6 +144,9 @@ datatype or field value.
 *FI_TYPE_FID*
 : struct fid *
 
+*FI_TYPE_HMEM_IFACE*
+: enum fi_hmem_iface *
+
 fi_tostr() will return a pointer to an internal libfabric buffer that
 should not be modified, and will be overwritten the next time
 fi_tostr() is invoked.  fi_tostr() is not thread safe.
diff --git a/deps/libfabric/man/fi_getinfo.3.md b/deps/libfabric/man/fi_getinfo.3.md
index 2e5c8f995bccacac79dfbabc00140963ebfa2bd2..5159356655bd3d3b97101b4db76219f60693eeff 100644
--- a/deps/libfabric/man/fi_getinfo.3.md
+++ b/deps/libfabric/man/fi_getinfo.3.md
@@ -263,6 +263,11 @@ additional optimizations.
   FI_WRITE, FI_REMOTE_READ, and FI_REMOTE_WRITE flags to restrict the
   types of atomic operations supported by an endpoint.
 
+*FI_COLLECTIVE*
+: Requests support for collective operations.  Endpoints that support
+  this capability support the collective operations defined in
+  [`fi_collective`(3)](fi_collective.3.html).
+
 *FI_DIRECTED_RECV*
 : Requests that the communication endpoint use the source address of
   an incoming message when matching it with a receive buffer.  If this
@@ -279,7 +284,7 @@ additional optimizations.
 
 *FI_HMEM*
 : Specifies that the endpoint should support transfers to and from
-  device memory. 
+  device memory.
 
 *FI_LOCAL_COMM*
 : Indicates that the endpoint support host local communication.  This
@@ -440,7 +445,7 @@ may optionally report non-selected secondary capabilities if doing so
 would not compromise performance or security.
 
 Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_MULTICAST,
-FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG, FI_HMEM
+FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG, FI_HMEM, FI_COLLECTIVE
 
 Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND,
 FI_REMOTE_READ, FI_REMOTE_WRITE
diff --git a/deps/libfabric/man/fi_mr.3.md b/deps/libfabric/man/fi_mr.3.md
index f5dfef002786c72425e91b37634f78a3a10ec3b9..1197c0dd2751a7fd50e328ce58d247d8dab27f66 100644
--- a/deps/libfabric/man/fi_mr.3.md
+++ b/deps/libfabric/man/fi_mr.3.md
@@ -475,6 +475,7 @@ struct fi_mr_attr {
 	union {
 		uint64_t         reserved;
 		int              cuda;
+		int		 ze;
 	} device;
 };
 ```
@@ -581,6 +582,13 @@ requested the FI_HMEM capability.
 : Uses Nvidia CUDA interfaces such as cuMemAlloc, cuMemAllocHost,
   cuMemAllocManaged, cuMemFree, cudaMalloc, cudaFree.
 
+*FI_HMEM_ROCR*
+: Uses AMD ROCR interfaces such as hsa_memory_allocate and hsa_memory_free.
+
+*FI_HMEM_ZE*
+: Uses Intel L0 ZE interfaces such as zeDriverAllocSharedMem,
+  zeDriverFreeMem.
+
 ## device
 Reserved 64 bits for device identifier if using non-standard HMEM interface.
 This field is ignored unless the iface field is valid.
@@ -588,6 +596,9 @@ This field is ignore unless the iface field is valid.
 *cuda*
 : For FI_HMEM_CUDA, this is equivalent to CUdevice (int).
 
+*ze*
+: For FI_HMEM_ZE, this is equivalent to the ze_device_handle_t index (int).
+
 # NOTES
 
 Direct access to an application's memory by a remote peer requires that
@@ -663,6 +674,11 @@ are unable to manage their own network buffers.  A registration cache avoids
 the overhead of registering and unregistering a data buffer with each
 transfer.
 
+If a registration cache is going to be used for host and device memory, the
+device must support unified virtual addressing. If the device does not
+support unified virtual addressing, either an additional registration cache
+is required to track this device memory, or device memory cannot be cached.
+
 As a general rule, if hardware requires the FI_MR_LOCAL mode bit described
 above, but this is not supported by the application, a memory registration
 cache _may_ be in use.  The following environment variables may be used to
@@ -684,15 +700,29 @@ configure registration caches.
   zero will disable registration caching.
 
 *FI_MR_CACHE_MONITOR*
-: The cache monitor is responsible for detecting changes made between the
-  virtual addresses used by an application and the underlying physical pages.
-  Valid monitor options are: userfaultfd, memhooks, and disabled.  Selecting
-  disabled will turn off the registration cache.  Userfaultfd is a Linux
-  kernel feature used to report virtual to physical address mapping changes
-  to user space.  Memhooks operates by intercepting relevant memory
-  allocation and deallocation calls which may result in the mappings changing,
-  such as malloc, mmap, free, etc.  Note that memhooks operates at the elf
-  linker layer, and does not use glibc memory hooks.
+: The cache monitor is responsible for detecting changes in the mapping
+  between the virtual addresses used by an application and the underlying
+  physical pages of system memory (FI_HMEM_SYSTEM). Valid monitor options
+  are: userfaultfd, memhooks, and disabled.  Selecting disabled will turn
+  off the registration cache.  Userfaultfd is a Linux kernel feature used
+  to report virtual to physical address mapping changes to user space.
+  Memhooks operates by intercepting relevant memory allocation and
+  deallocation calls which may result in the mappings changing, such as
+  malloc, mmap, free, etc.  Note that memhooks operates at the elf linker
+  layer, and does not use glibc memory hooks.
+
+*FI_MR_CUDA_CACHE_MONITOR_ENABLED*
+: The CUDA cache monitor is responsible for detecting changes in the mapping
+  between the device virtual addresses used by an application and the
+  underlying device physical pages for CUDA device memory (FI_HMEM_CUDA).
+  Valid values are 0 (disabled) and 1 (enabled). Note that the CUDA memory
+  monitor requires a CUDA toolkit version with unified virtual addressing
+  enabled.
+
+*FI_MR_ROCR_CACHE_MONITOR_ENABLED*
+: The ROCR cache monitor is responsible for detecting changes in the mapping
+  between the device virtual addresses used by an application and the
+  underlying device physical pages for ROCR device memory (FI_HMEM_ROCR).
+  Valid values are 0 (disabled) and 1 (enabled). Note that the ROCR memory
+  monitor requires a ROCR version with unified virtual addressing enabled.
 
 # SEE ALSO
 
diff --git a/deps/libfabric/man/fi_rxm.7.md b/deps/libfabric/man/fi_rxm.7.md
index 26194f40e65dc6fcb2e45a61718633bd024994df..35c86fde42b7a92129a0aab2426abdf5378fcdf1 100644
--- a/deps/libfabric/man/fi_rxm.7.md
+++ b/deps/libfabric/man/fi_rxm.7.md
@@ -137,8 +137,10 @@ The ofi_rxm provider checks for the following environment variables.
   via rendezvous protocol.
 
 *FI_OFI_RXM_USE_SRX*
-: Set this to 1 to use shared receive context from MSG provider. This reduces
-  overall memory usage but there may be a slight increase in latency (default: 0).
+: Set this to 1 to use a shared receive context from the MSG provider, or 0
+  to disable it. Shared receive contexts reduce overall memory usage, but
+  may increase message latency.  If not set, the verbs provider will not use
+  shared receive contexts by default, but the tcp provider will.
 
 *FI_OFI_RXM_TX_SIZE*
 : Defines default TX context size (default: 1024)
@@ -158,7 +160,7 @@ with (default: 256).
 
 *FI_OFI_RXM_CM_PROGRESS_INTERVAL*
 : Defines the duration of time in microseconds between calls to RxM CM progression
-  functions when using manual progress. Higher values may provide less noise for 
+  functions when using manual progress. Higher values may provide less noise for
   calls to fi_cq read functions, but may increase connection setup time (default: 10000)
 
 *FI_OFI_RXM_CQ_EQ_FAIRNESS*
diff --git a/deps/libfabric/man/man3/fi_domain.3 b/deps/libfabric/man/man3/fi_domain.3
index 873135abe81b502d0bd15c853806419d8236d9b7..77845c8b8a85d03560176897fa5b76f3a79c4a7c 100644
--- a/deps/libfabric/man/man3/fi_domain.3
+++ b/deps/libfabric/man/man3/fi_domain.3
@@ -1,7 +1,7 @@
 .\"t
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fi_domain" "3" "2020\-02\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_domain" "3" "2020\-07\-30" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -24,6 +24,9 @@ int\ fi_domain_bind(struct\ fid_domain\ *domain,\ struct\ fid\ *eq,
 
 int\ fi_open_ops(struct\ fid\ *domain,\ const\ char\ *name,\ uint64_t\ flags,
 \ \ \ \ void\ **ops,\ void\ *context);
+
+int\ fi_set_ops(struct\ fid\ *domain,\ const\ char\ *name,\ uint64_t\ flags,
+\ \ \ \ void\ *ops,\ void\ *context);
 \f[]
 .fi
 .SH ARGUMENTS
@@ -85,6 +88,84 @@ Provider interfaces may be used to access low\-level resources and
 operations that are specific to the opened resource domain.
 The details of domain interfaces are outside the scope of this
 documentation.
+.SS fi_set_ops
+.PP
+fi_set_ops assigns callbacks that a provider should invoke in place of
+performing selected tasks.
+This allows users to modify or control a provider\[aq]s default
+behavior.
+Conceptually, it allows the user to hook specific functions used by a
+provider and replace them with their own.
+.PP
+The operations being modified are identified using a well\-known
+character string, passed as the name parameter.
+The format of the ops parameter is dependent upon the name value.
+The ops parameter will reference a structure containing the callbacks
+and other fields needed by the provider to invoke the user\[aq]s
+functions.
+.PP
+If a provider accepts the override, it will return FI_SUCCESS.
+If the override is unknown or not supported, the provider will return
+\-FI_ENOSYS.
+Overrides should be set prior to allocating resources on the domain.
+.PP
+The following fi_set_ops operations and corresponding callback
+structures are defined.
+.PP
+\f[B]FI_SET_OPS_HMEM_OVERRIDE \-\- Heterogeneous Memory Overrides\f[]
+.PP
+HMEM override allows users to override HMEM related operations a
+provider may perform.
+Currently, the scope of the HMEM override is to allow a user to define
+the memory movement functions a provider should use when accessing a
+user buffer.
+The user\-defined memory movement functions need to account for all the
+different HMEM iface types a provider may encounter.
+.PP
+All objects allocated against a domain will inherit this override.
+.PP
+The following is the HMEM override operation name and structure.
+.IP
+.nf
+\f[C]
+#define\ FI_SET_OPS_HMEM_OVERRIDE\ "hmem_override_ops"
+
+struct\ fi_hmem_override_ops\ {
+\ \ \ \ size_t\ \ size;
+
+\ \ \ \ ssize_t\ (*copy_from_hmem_iov)(void\ *dest,\ size_t\ size,
+\ \ \ \ \ \ \ \ enum\ fi_hmem_iface\ iface,\ uint64_t\ device,\ const\ struct\ iovec\ *hmem_iov,
+\ \ \ \ \ \ \ \ size_t\ hmem_iov_count,\ uint64_t\ hmem_iov_offset);
+
+\ \ \ \ ssize_t\ (*copy_to_hmem_iov)(enum\ fi_hmem_iface\ iface,\ uint64_t\ device,
+\ \ \ \ const\ struct\ iovec\ *hmem_iov,\ size_t\ hmem_iov_count,
+\ \ \ \ \ \ \ \ uint64_t\ hmem_iov_offset,\ const\ void\ *src,\ size_t\ size);
+};
+\f[]
+.fi
+.PP
+All fields in struct fi_hmem_override_ops must be set (non\-null) to a
+valid value.
+.TP
+.B \f[I]size\f[]
+This should be set to the sizeof(struct fi_hmem_override_ops).
+The size field is used for forward and backward compatibility purposes.
+.RS
+.RE
+.TP
+.B \f[I]copy_from_hmem_iov\f[]
+Copy data from the device/hmem to host memory.
+This function should return a negative fi_errno on error, or the number
+of bytes copied on success.
+.RS
+.RE
+.TP
+.B \f[I]copy_to_hmem_iov\f[]
+Copy data from host memory to the device/hmem.
+This function should return a negative fi_errno on error, or the number
+of bytes copied on success.
+.RS
+.RE
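+.PP
+As an illustration only, an application might install the override on an
+open domain as follows.
+The my_copy_from_hmem_iov and my_copy_to_hmem_iov routines are
+placeholders for user\-defined implementations matching the signatures
+above.
+.IP
+.nf
+\f[C]
+struct\ fi_hmem_override_ops\ hmem_ops\ =\ {
+\ \ \ \ .size\ =\ sizeof(struct\ fi_hmem_override_ops),
+\ \ \ \ .copy_from_hmem_iov\ =\ my_copy_from_hmem_iov,
+\ \ \ \ .copy_to_hmem_iov\ =\ my_copy_to_hmem_iov,
+};
+
+int\ ret\ =\ fi_set_ops(&domain\->fid,\ FI_SET_OPS_HMEM_OVERRIDE,\ 0,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ &hmem_ops,\ NULL);
+/*\ ret\ is\ \-FI_ENOSYS\ if\ the\ provider\ does\ not\ support\ the\ override\ */
+\f[]
+.fi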
 .SS fi_domain_bind
 .PP
 Associates an event queue with the domain.
@@ -757,6 +838,12 @@ on the default attributes of an allocated endpoint, such as the endpoint
 capabilities and size.
 The endpoint count is the number of addressable endpoints supported by
 the provider.
+Providers return capability limits based on the configured hardware
+maximums.
+Providers cannot predict all possible system limitations; constraints
+discovered at runtime (e.g.
+application memory consumption, file descriptor usage) may further
+reduce these hardware maximums.
 .SS Transmit Context Count (tx_ctx_cnt)
 .PP
 The number of outbound command queues optimally supported by the
diff --git a/deps/libfabric/man/man3/fi_endpoint.3 b/deps/libfabric/man/man3/fi_endpoint.3
index d2b7722f8cf7151a21ce83d99f6a80af5411598b..1439db87f54b00109aef9e7a6eb1d6e5d31af80f 100644
--- a/deps/libfabric/man/man3/fi_endpoint.3
+++ b/deps/libfabric/man/man3/fi_endpoint.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fi_endpoint" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_endpoint" "3" "2020\-08\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -1088,7 +1088,8 @@ capability bits from the fi_info structure will be used.
 .PP
 The following capabilities apply to the transmit attributes: FI_MSG,
 FI_RMA, FI_TAGGED, FI_ATOMIC, FI_READ, FI_WRITE, FI_SEND, FI_HMEM,
-FI_TRIGGER, FI_FENCE, FI_MULTICAST, FI_RMA_PMEM, and FI_NAMED_RX_CTX.
+FI_TRIGGER, FI_FENCE, FI_MULTICAST, FI_RMA_PMEM, FI_NAMED_RX_CTX, and
+FI_COLLECTIVE.
 .PP
 Many applications will be able to ignore this field and rely solely on
 the fi_info::caps field.
@@ -1460,7 +1461,8 @@ capability bits from the fi_info structure will be used.
 The following capabilities apply to the receive attributes: FI_MSG,
 FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV,
 FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_VARIABLE_MSG,
-FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, and FI_SOURCE_ERR.
+FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, and
+FI_COLLECTIVE.
 .PP
 Many applications will be able to ignore this field and rely solely on
 the fi_info::caps field.
diff --git a/deps/libfabric/man/man3/fi_fabric.3 b/deps/libfabric/man/man3/fi_fabric.3
index d8ce12600a1e0868d4d5b88574c57e8af302e0d3..02ca38954d4b9caa420d0aa660edf50cb5aceae0 100644
--- a/deps/libfabric/man/man3/fi_fabric.3
+++ b/deps/libfabric/man/man3/fi_fabric.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fi_fabric" "3" "2020\-01\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_fabric" "3" "2020\-06\-02" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -197,6 +197,11 @@ enum fi_op_type
 struct fid *
 .RS
 .RE
+.TP
+.B \f[I]FI_TYPE_HMEM_IFACE\f[]
+enum fi_hmem_iface *
+.RS
+.RE
 .PP
 fi_tostr() will return a pointer to an internal libfabric buffer that
 should not be modified, and will be overwritten the next time fi_tostr()
diff --git a/deps/libfabric/man/man3/fi_getinfo.3 b/deps/libfabric/man/man3/fi_getinfo.3
index 2147abf81d9d5187f3b5bfed49e987dc4f36d25b..7a7b24c522574a69f4803e818a0415896d2efb47 100644
--- a/deps/libfabric/man/man3/fi_getinfo.3
+++ b/deps/libfabric/man/man3/fi_getinfo.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fi_getinfo" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_getinfo" "3" "2020\-08\-07" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -337,6 +337,13 @@ supported by an endpoint.
 .RS
 .RE
 .TP
+.B \f[I]FI_COLLECTIVE\f[]
+Requests support for collective operations.
+Endpoints that support this capability support the collective operations
+defined in \f[C]fi_collective\f[](3).
+.RS
+.RE
+.TP
 .B \f[I]FI_DIRECTED_RECV\f[]
 Requests that the communication endpoint use the source address of an
 incoming message when matching it with a receive buffer.
@@ -580,7 +587,7 @@ doing so would not compromise performance or security.
 .PP
 Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC,
 FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG,
-FI_HMEM
+FI_HMEM, FI_COLLECTIVE
 .PP
 Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ,
 FI_REMOTE_WRITE
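Because FI_COLLECTIVE is a primary capability, it is only enabled when explicitly requested in the hints. A minimal sketch (the API version passed is illustrative):

```c
#include <stddef.h>
#include <rdma/fabric.h>

/* Ask for an RDM endpoint with collective support enabled. */
static struct fi_info *get_collective_info(void)
{
	struct fi_info *hints, *info = NULL;

	hints = fi_allocinfo();
	if (!hints)
		return NULL;

	hints->ep_attr->type = FI_EP_RDM;
	hints->caps = FI_MSG | FI_COLLECTIVE; /* primary caps must be requested */

	if (fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info))
		info = NULL;
	fi_freeinfo(hints);
	return info;
}
```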
diff --git a/deps/libfabric/man/man3/fi_mr.3 b/deps/libfabric/man/man3/fi_mr.3
index aab59e0722f98a7e8afe80212cc7c4527c9b46d2..afcc664dd1a4805922202c3e174edc17084ef7c6 100644
--- a/deps/libfabric/man/man3/fi_mr.3
+++ b/deps/libfabric/man/man3/fi_mr.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fi_mr" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_mr" "3" "2020\-08\-11" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -577,6 +577,7 @@ struct\ fi_mr_attr\ {
 \ \ \ \ union\ {
 \ \ \ \ \ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ reserved;
 \ \ \ \ \ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ cuda;
+\ \ \ \ \ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ ze;
 \ \ \ \ }\ device;
 };
 \f[]
@@ -702,6 +703,18 @@ Uses Nvidia CUDA interfaces such as cuMemAlloc, cuMemAllocHost,
 cuMemAllocManaged, cuMemFree, cudaMalloc, cudaFree.
 .RS
 .RE
+.TP
+.B \f[I]FI_HMEM_ROCR\f[]
+Uses AMD ROCR interfaces such as hsa_memory_allocate and
+hsa_memory_free.
+.RS
+.RE
+.TP
+.B \f[I]FI_HMEM_ZE\f[]
+Uses Intel Level Zero (L0) interfaces such as zeDriverAllocSharedMem
+and zeDriverFreeMem.
+.RS
+.RE
 .SS device
 .PP
 Reserved 64 bits for device identifier if using non\-standard HMEM
@@ -712,6 +725,12 @@ This field is ignored unless the iface field is valid.
 For FI_HMEM_CUDA, this is equivalent to CUdevice (int).
 .RS
 .RE
+.TP
+.B \f[I]ze\f[]
+For FI_HMEM_ZE, this is equivalent to the ze_device_handle_t index
+(int).
+.RS
+.RE
 .SH NOTES
 .PP
 Direct access to an application\[aq]s memory by a remote peer requires
@@ -798,6 +817,12 @@ their own network buffers.
 A registration cache avoids the overhead of registering and
 unregistering a data buffer with each transfer.
 .PP
+If a registration cache is going to be used for host and device memory,
+the device must support unified virtual addressing.
+If the device does not support unified virtual addressing, either an
+additional registration cache is required to track this device memory,
+or device memory cannot be cached.
+.PP
 As a general rule, if hardware requires the FI_MR_LOCAL mode bit
 described above, but this is not supported by the application, a memory
 registration cache \f[I]may\f[] be in use.
@@ -827,9 +852,9 @@ Setting this to zero will disable registration caching.
 .RE
 .TP
 .B \f[I]FI_MR_CACHE_MONITOR\f[]
-The cache monitor is responsible for detecting changes made between the
-virtual addresses used by an application and the underlying physical
-pages.
+The cache monitor is responsible for detecting changes to the mapping
+between the virtual addresses used by an application and the underlying
+physical pages for system memory (FI_HMEM_SYSTEM).
 Valid monitor options are: userfaultfd, memhooks, and disabled.
 Selecting disabled will turn off the registration cache.
 Userfaultfd is a Linux kernel feature used to report virtual to physical
@@ -841,6 +866,26 @@ Note that memhooks operates at the elf linker layer, and does not use
 glibc memory hooks.
 .RS
 .RE
+.TP
+.B \f[I]FI_MR_CUDA_CACHE_MONITOR_ENABLED\f[]
+The CUDA cache monitor is responsible for detecting changes to the
+mapping between the device virtual addresses used by an application and
+the underlying device physical pages for CUDA device memory
+(FI_HMEM_CUDA).
+Valid values are 0 or 1.
+Note that the CUDA memory monitor requires a CUDA toolkit version with
+unified virtual addressing enabled.
+.RS
+.RE
+.TP
+.B \f[I]FI_MR_ROCR_CACHE_MONITOR_ENABLED\f[]
+The ROCR cache monitor is responsible for detecting changes to the
+mapping between the device virtual addresses used by an application and
+the underlying device physical pages for ROCR device memory
+(FI_HMEM_ROCR).
+Valid values are 0 or 1.
+Note that the ROCR memory monitor requires a ROCR version with unified
+virtual addressing enabled.
+.RS
+.RE
 .SH SEE ALSO
 .PP
 \f[C]fi_getinfo\f[](3), \f[C]fi_endpoint\f[](3), \f[C]fi_domain\f[](3),
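The iface and device fields above are supplied through fi_mr_regattr(). A minimal sketch registering a CUDA device buffer follows; the buffer pointer and the device index 0 are placeholders.

```c
#include <sys/uio.h>
#include <rdma/fi_domain.h>

static int reg_cuda_buf(struct fid_domain *domain, void *cuda_buf, size_t len,
			struct fid_mr **mr)
{
	struct iovec iov = { .iov_base = cuda_buf, .iov_len = len };
	struct fi_mr_attr attr = {
		.mr_iov      = &iov,
		.iov_count   = 1,
		.access      = FI_SEND | FI_RECV,
		.iface       = FI_HMEM_CUDA,
		.device.cuda = 0, /* CUdevice index; placeholder */
	};

	/* Returns 0 on success or a negative fi_errno value. */
	return fi_mr_regattr(domain, &attr, 0, mr);
}
```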
diff --git a/deps/libfabric/man/man7/fabric.7 b/deps/libfabric/man/man7/fabric.7
index f144062725567277c56deef460eb9ef0176526e8..c4b812bfc72a3ff051607ea7cd0e0a4f89b6148a 100644
--- a/deps/libfabric/man/man7/fabric.7
+++ b/deps/libfabric/man/man7/fabric.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fabric" "7" "2019\-05\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fabric" "7" "2020\-07\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -383,6 +383,80 @@ may not be available in a child process because of copy on write
 restrictions.
 .RS
 .RE
+.SH ABI CHANGES
+.PP
+libfabric releases maintain compatibility with older releases, so that
+compiled applications can continue to work as\-is, and previously
+written applications will compile against newer versions of the library
+without needing source code changes.
+The changes below describe ABI updates that have occurred and which
+libfabric release corresponds to the changes.
+.PP
+Note that because most functions called by applications actually call
+static inline functions, which in turn reference function pointers in
+order to call directly into providers, libfabric only exports a handful
+of functions directly.
+ABI changes are limited to those functions, most notably the fi_getinfo
+call and its returned attribute structures.
+.PP
+The ABI version is independent of the libfabric release version.
+.SS ABI 1.0
+.PP
+The initial libfabric release (1.0.0) also corresponds to ABI version
+1.0.
+The 1.0 ABI was unchanged for libfabric major.minor versions 1.0, 1.1,
+1.2, 1.3, and 1.4.
+.SS ABI 1.1
+.PP
+A number of external data structures had fields appended starting with
+libfabric version 1.5.
+These changes included adding fields to the following data
+structures.
+The 1.1 ABI was exported by libfabric versions 1.5 and 1.6.
+.TP
+.B \f[I]fi_fabric_attr\f[]
+Added api_version
+.RS
+.RE
+.TP
+.B \f[I]fi_domain_attr\f[]
+Added cntr_cnt, mr_iov_limit, caps, mode, auth_key, auth_key_size,
+max_err_data, and mr_cnt fields.
+The mr_mode field was also changed from an enum to an integer flag
+field.
+.RS
+.RE
+.TP
+.B \f[I]fi_ep_attr\f[]
+Added auth_key_size and auth_key fields.
+.RS
+.RE
+.SS ABI 1.2
+.PP
+The 1.2 ABI version was exported by libfabric versions 1.7 and 1.8, and
+expanded the following structure.
+.TP
+.B \f[I]fi_info\f[]
+The fi_info structure was expanded to reference a new fabric object,
+fid_nic.
+When available, the fid_nic references a new set of attributes related
+to network hardware details.
+.RS
+.RE
+.SS ABI 1.3
+.PP
+The 1.3 ABI is also the current ABI version.
+All libfabric releases starting at 1.9 export this ABI.
+.TP
+.B \f[I]fi_domain_attr\f[]
+Added tclass
+.RS
+.RE
+.TP
+.B \f[I]fi_tx_attr\f[]
+Added tclass
+.RS
+.RE
 .SH SEE ALSO
 .PP
 \f[C]fi_info\f[](1), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3),
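The practical consequence of this versioning is that an application pins the API level it was written against in its fi_getinfo() call, and a newer library preserves the older structure semantics. A minimal sketch:

```c
#include <stddef.h>
#include <rdma/fabric.h>

/* An application written against the 1.5 API passes that version so a
 * newer libfabric keeps the 1.5 structure layouts and behavior. */
static struct fi_info *query_pinned_api(void)
{
	struct fi_info *info = NULL;

	if (fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, NULL, &info))
		return NULL;
	return info;
}
```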
diff --git a/deps/libfabric/man/man7/fi_rxm.7 b/deps/libfabric/man/man7/fi_rxm.7
index 0980aa54a95fbeeabd83db28280eb5cf58ef76b3..3883ac2fdc0598e002951e60a36774ac23af9dab 100644
--- a/deps/libfabric/man/man7/fi_rxm.7
+++ b/deps/libfabric/man/man7/fi_rxm.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 1.19.2.4
 .\"
-.TH "fi_rxm" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
+.TH "fi_rxm" "7" "2020\-06\-06" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@"
 .hy
 .SH NAME
 .PP
@@ -157,9 +157,12 @@ transmitted via rendezvous protocol.
 .RE
 .TP
 .B \f[I]FI_OFI_RXM_USE_SRX\f[]
-Set this to 1 to use shared receive context from MSG provider.
-This reduces overall memory usage but there may be a slight increase in
-latency (default: 0).
+Set this to 1 to use a shared receive context from the MSG provider, or
+0 to disable using a shared receive context.
+Shared receive contexts reduce overall memory usage, but may increase
+message latency.
+If not set, the verbs provider will not use shared receive contexts by
+default, but the tcp provider will.
 .RS
 .RE
 .TP
diff --git a/deps/libfabric/prov/efa/Makefile.include b/deps/libfabric/prov/efa/Makefile.include
index 14e3171463a3e34d1af587c8afe4d50e03358692..a4d027f0e77cb4ad98e4fdc13f7886645b28803a 100644
--- a/deps/libfabric/prov/efa/Makefile.include
+++ b/deps/libfabric/prov/efa/Makefile.include
@@ -60,7 +60,6 @@ _efa_files = \
 
 _efa_headers = \
 	prov/efa/src/efa.h \
-	prov/efa/src/rxr/efa_cuda.h \
 	prov/efa/src/rxr/rxr.h \
 	prov/efa/src/rxr/rxr_cntr.h \
 	prov/efa/src/rxr/rxr_rma.h \
diff --git a/deps/libfabric/prov/efa/src/efa.h b/deps/libfabric/prov/efa/src/efa.h
index 29a487482574d30a26259d7cbf94d0ee88fe253c..e41cd38157adf527f903944449fdb59604d226e6 100644
--- a/deps/libfabric/prov/efa/src/efa.h
+++ b/deps/libfabric/prov/efa/src/efa.h
@@ -48,7 +48,6 @@
 #include <assert.h>
 #include <pthread.h>
 #include <sys/epoll.h>
-#include <uthash.h>
 
 #include <rdma/fabric.h>
 #include <rdma/fi_cm.h>
@@ -152,7 +151,7 @@ struct efa_domain {
 	struct ibv_pd		*ibv_pd;
 	struct fi_info		*info;
 	struct efa_fabric	*fab;
-	struct ofi_mr_cache	cache;
+	struct ofi_mr_cache	*cache;
 	struct efa_qp		**qp_table;
 	size_t			qp_table_sz_m1;
 };
@@ -194,6 +193,7 @@ struct efa_cq {
 
 struct efa_context {
 	struct ibv_context	*ibv_ctx;
+	int			dev_idx;
 	uint64_t		max_mr_size;
 	uint16_t		inline_buf_size;
 	uint16_t		max_wr_rdma_sge;
@@ -201,6 +201,11 @@ struct efa_context {
 	uint32_t		device_caps;
 };
 
+struct efa_pd {
+	struct ibv_pd	   *ibv_pd;
+	int		   use_cnt;
+};
+
 struct efa_qp {
 	struct ibv_qp	*ibv_qp;
 	struct ibv_qp_ex *ibv_qp_ex;
@@ -246,6 +251,7 @@ struct efa_ep {
 	struct ibv_recv_wr	*recv_more_wr_tail;
 	struct ofi_bufpool	*send_wr_pool;
 	struct ofi_bufpool	*recv_wr_pool;
+	struct ibv_ah		*self_ah;
 };
 
 struct efa_send_wr {
@@ -273,7 +279,6 @@ struct efa_av {
 	enum fi_av_type		type;
 	efa_addr_to_conn_func	addr_to_conn;
 	struct efa_reverse_av	*reverse_av;
-	struct efa_av_entry     *av_map;
 	struct util_av		util_av;
 	enum fi_ep_type         ep_type;
 	/* Used only for FI_AV_TABLE */
@@ -285,7 +290,6 @@ struct efa_av_entry {
 	fi_addr_t		rdm_addr;
 	fi_addr_t		shm_rdm_addr;
 	bool			local_mapping;
-	UT_hash_handle		hh;
 };
 
 struct efa_ah_qpn {
@@ -334,6 +338,13 @@ extern struct fi_ops_cm efa_ep_cm_ops;
 extern struct fi_ops_msg efa_ep_msg_ops;
 extern struct fi_ops_rma efa_ep_rma_ops;
 
+ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg,
+			  uint64_t flags, bool self_comm);
+
+extern fastlock_t pd_list_lock;
+/* This list has the same indices as ctx_list. */
+extern struct efa_pd *pd_list;
+
 int efa_device_init(void);
 void efa_device_free(void);
 
@@ -367,6 +378,8 @@ ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_
 
 ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, uint64_t flags);
 
+bool efa_device_support_rdma_read(void);
+
 static inline
 bool efa_ep_support_rdma_read(struct fid_ep *ep_fid)
 {
@@ -406,4 +419,59 @@ size_t efa_max_rdma_size(struct fid_ep *ep_fid)
 	return efa_ep->domain->ctx->max_rdma_size;
 }
 
+static inline
+struct rxr_peer *efa_ep_get_peer(struct dlist_entry *ep_list_entry,
+				 fi_addr_t addr)
+{
+	struct util_ep *util_ep;
+	struct rxr_ep *rxr_ep;
+
+	util_ep = container_of(ep_list_entry, struct util_ep,
+			       av_entry);
+	rxr_ep = container_of(util_ep, struct rxr_ep, util_ep);
+	return rxr_ep_get_peer(rxr_ep, addr);
+}
+
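+/*
+ * A peer is considered in use when it has transmits pending, is in RNR
+ * backoff, or still holds an unprocessed packet in its reorder buffer.
+ */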
+static inline
+int efa_peer_in_use(struct rxr_peer *peer)
+{
+	struct rxr_pkt_entry *pending_pkt;
+
+	if ((peer->tx_pending) || (peer->flags & RXR_PEER_IN_BACKOFF))
+		return -FI_EBUSY;
+	if (peer->rx_init) {
+		pending_pkt = *ofi_recvwin_peek(peer->robuf);
+		if (pending_pkt && pending_pkt->pkt)
+			return -FI_EBUSY;
+	}
+	return 0;
+}
+
+static inline
+void efa_free_robuf(struct rxr_peer *peer)
+{
+	ofi_recvwin_free(peer->robuf);
+	ofi_buf_free(peer->robuf);
+}
+
+static inline
+void efa_peer_reset(struct rxr_peer *peer)
+{
+	efa_free_robuf(peer);
+#ifdef ENABLE_EFA_POISONING
+	rxr_poison_mem_region((uint32_t *)peer, sizeof(struct rxr_peer));
+#endif
+	memset(peer, 0, sizeof(struct rxr_peer));
+	dlist_init(&peer->rnr_entry);
+}
+
+static inline bool efa_ep_is_cuda_mr(struct efa_mr *efa_mr)
+{
+	return efa_mr ? (efa_mr->peer.iface == FI_HMEM_CUDA) : false;
+}
+
+static inline bool efa_is_cache_available(struct efa_domain *efa_domain)
+{
+	return efa_domain->cache;
+}
+
 #endif /* EFA_H */
diff --git a/deps/libfabric/prov/efa/src/efa_av.c b/deps/libfabric/prov/efa/src/efa_av.c
index 4fe7ce3548227f9e4537819e1cf939afa51ec1f2..f1b821067b06215d0f66499e77e58eee8a1eb44d 100644
--- a/deps/libfabric/prov/efa/src/efa_av.c
+++ b/deps/libfabric/prov/efa/src/efa_av.c
@@ -124,6 +124,24 @@ static size_t efa_av_tbl_find_first_empty(struct efa_av *av, size_t hint)
 	return -1;
 }
 
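+/*
+ * Grow the endpoint's per-peer array to new_count entries; the newly
+ * added range is poisoned (when enabled) and zeroed. Returns 0 on
+ * success or -FI_ENOMEM if the reallocation fails.
+ */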
+static int efa_peer_resize(struct rxr_ep *ep, size_t current_count,
+			   size_t new_count)
+{
+	void *p = realloc(&ep->peer[0], (new_count * sizeof(struct rxr_peer)));
+
+	if (p)
+		ep->peer = p;
+	else
+		return -FI_ENOMEM;
+#ifdef ENABLE_EFA_POISONING
+	rxr_poison_mem_region((uint32_t *)&ep->peer[current_count], (new_count -
+			      current_count) * sizeof(struct rxr_peer));
+#endif
+	memset(&ep->peer[current_count], 0,
+		(new_count - current_count) * sizeof(struct rxr_peer));
+	return 0;
+}
+
 static int efa_av_resize(struct efa_av *av, size_t new_av_count)
 {
 	if (av->type == FI_AV_TABLE) {
@@ -136,6 +154,12 @@ static int efa_av_resize(struct efa_av *av, size_t new_av_count)
 		else
 			return -FI_ENOMEM;
 
+#ifdef ENABLE_EFA_POISONING
+		rxr_poison_mem_region((uint32_t *)av->conn_table + av->util_av.count,
+				      (new_av_count - av->util_av.count) *
+				      sizeof(*av->conn_table));
+#endif
+
 		memset(av->conn_table + av->util_av.count, 0,
 		       (new_av_count - av->util_av.count) * sizeof(*av->conn_table));
 	}
@@ -197,11 +221,13 @@ static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr,
 
 		break;
 	case FI_AV_TABLE:
-		av->next = efa_av_tbl_find_first_empty(av, av->next);
-		assert(av->next != -1);
-		*fi_addr = av->next;
+		if (av->ep_type == FI_EP_DGRAM) {
+			av->next = efa_av_tbl_find_first_empty(av, av->next);
+			assert(av->next != -1);
+			*fi_addr = av->next;
+		}
 
-		av->conn_table[av->next] = conn;
+		av->conn_table[*fi_addr] = conn;
 		av->next++;
 		break;
 	default:
@@ -259,41 +285,51 @@ int efa_av_insert_addr(struct efa_av *av, struct efa_ep_addr *addr,
 			   void *context)
 {
 	struct efa_av_entry *av_entry;
+	struct util_av_entry *util_av_entry;
 	int ret = 0;
 	struct rxr_peer *peer;
 	struct rxr_ep *rxr_ep;
 	struct util_ep *util_ep;
 	struct dlist_entry *ep_list_entry;
 	fi_addr_t shm_fiaddr;
-	char smr_name[RXR_MAX_NAME_LENGTH];
+	char smr_name[NAME_MAX];
 
 	fastlock_acquire(&av->util_av.lock);
+	ret = ofi_av_insert_addr(&av->util_av, addr, fi_addr);
 
-	HASH_FIND(hh, av->av_map, addr, EFA_EP_ADDR_LEN, av_entry);
-	if (av_entry) {
-		*fi_addr = av_entry->rdm_addr;
-		goto find_out;
-	}
-	if (av->used + 1 > av->util_av.count) {
-		ret = efa_av_resize(av, av->used + 1);
-		if (ret)
-			goto out;
-	}
-	ret = efa_av_insert_ah(av, addr, fi_addr,
-				flags, context);
 	if (ret) {
 		EFA_WARN(FI_LOG_AV, "Error in inserting address: %s\n",
 			 fi_strerror(ret));
 		goto out;
 	}
-	av_entry = calloc(1, sizeof(*av_entry));
-	if (OFI_UNLIKELY(!av_entry)) {
-		ret = -FI_ENOMEM;
-		EFA_WARN(FI_LOG_AV, "Failed to allocate memory for av_entry\n");
-		goto out;
-	}
-	memcpy((void *)&av_entry->ep_addr, addr, EFA_EP_ADDR_LEN);
+	util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool,
+					     *fi_addr);
+	/*
+	 * If the entry already exists, then calling ofi_av_insert_addr
+	 * increases its use_cnt by 1. For a new entry use_cnt will be 1,
+	 * whereas for a duplicate entry use_cnt will be more than 1.
+	 */
+	if (ofi_atomic_get32(&util_av_entry->use_cnt) > 1)
+		goto find_out;
+
+	av_entry = (struct efa_av_entry *)util_av_entry->data;
 	av_entry->rdm_addr = *fi_addr;
+	av_entry->local_mapping = 0;
+
+	if (av->used + 1 > av->util_av.count) {
+		ret = efa_av_resize(av, av->util_av.count * 2);
+		if (ret)
+			goto out;
+		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
+			util_ep = container_of(ep_list_entry, struct util_ep,
+					       av_entry);
+			rxr_ep = container_of(util_ep, struct rxr_ep, util_ep);
+			ret = efa_peer_resize(rxr_ep, av->used,
+					      av->util_av.count);
+			if (ret)
+				goto out;
+		}
+	}
 
 	/*
 	 * Walk through all the EPs that bound to the AV,
@@ -356,8 +392,13 @@ int efa_av_insert_addr(struct efa_av *av, struct efa_ep_addr *addr,
 			}
 		}
 	}
-	HASH_ADD(hh, av->av_map, ep_addr,
-			EFA_EP_ADDR_LEN, av_entry);
+	ret = efa_av_insert_ah(av, addr, fi_addr,
+			       flags, context);
+	if (ret) {
+		EFA_WARN(FI_LOG_AV, "Error in inserting address: %s\n",
+			 fi_strerror(ret));
+		goto err_free_av_entry;
+	}
 
 find_out:
 	EFA_INFO(FI_LOG_AV,
@@ -365,7 +406,7 @@ find_out:
 			*(uint64_t *)addr, *fi_addr);
 	goto out;
 err_free_av_entry:
-	free(av_entry);
+	ofi_ibuf_free(util_av_entry);
 out:
 	fastlock_release(&av->util_av.lock);
 	return ret;
@@ -390,15 +431,6 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
 		return -FI_ENOSYS;
 
 	if (av->ep_type == FI_EP_RDM) {
-		if (av->used + count > av->util_av.count) {
-			EFA_WARN(FI_LOG_AV,
-				"AV insert failed. Expect inserting %zu AV entries, but only %zu available\n",
-				count, av->util_av.count - av->used);
-			if (av->util_av.eq)
-				ofi_av_write_event(&av->util_av, i, FI_ENOMEM,
-					context);
-			goto out;
-		}
 		for (i = 0; i < count; i++) {
 			addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
 			ret = efa_av_insert_addr(av, addr_i, &fi_addr_res,
@@ -527,50 +559,81 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
 	int ret = 0;
 	size_t i;
 	struct efa_av *av;
+	struct util_av_entry *util_av_entry;
 	struct efa_av_entry *av_entry;
-	struct efa_ep_addr addr;
+	struct rxr_peer *peer;
+	struct dlist_entry *ep_list_entry;
 
 	av = container_of(av_fid, struct efa_av, util_av.av_fid);
-	if (av->ep_type == FI_EP_RDM) {
-		fastlock_acquire(&av->util_av.lock);
+	if (av->ep_type == FI_EP_DGRAM) {
 		for (i = 0; i < count; i++) {
-			ret = efa_av_lookup(&av->util_av.av_fid, fi_addr[i],
-						&addr, &av->util_av.addrlen);
+			ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i],
+					       1, flags);
 			if (ret)
-				goto release_lock;
+				goto out;
+		}
+		goto out;
+	}
+	fastlock_acquire(&av->util_av.lock);
+	for (i = 0; i < count; i++) {
+		if (fi_addr[i] == FI_ADDR_NOTAVAIL ||
+		    fi_addr[i] > av->util_av.count) {
+			ret = -FI_ENOENT;
+			goto release_lock;
+		}
+		util_av_entry = ofi_bufpool_get_ibuf(
+						av->util_av.av_entry_pool,
+						fi_addr[i]);
+		if (!util_av_entry) {
+			ret = -FI_ENOENT;
+			goto release_lock;
+		}
+		/*
+		 * If use_cnt is greater than 1, then just decrement
+		 * the count by 1, without removing the entry.
+		 */
+		if (ofi_atomic_get32(&util_av_entry->use_cnt) > 1) {
+			ret = ofi_av_remove_addr(&av->util_av, fi_addr[i]);
+			goto release_lock;
+		}
+		av_entry = (struct efa_av_entry *)util_av_entry->data;
 
-			ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i], 1, flags);
+		/* Check if the peer is in use if it is then return */
+		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
+			peer = efa_ep_get_peer(ep_list_entry, fi_addr[i]);
+			ret = efa_peer_in_use(peer);
 			if (ret)
 				goto release_lock;
-			HASH_FIND(hh, av->av_map, &addr, av->util_av.addrlen, av_entry);
-			if (!av_entry) {
-				ret = -FI_EINVAL;
-				goto release_lock;
-			}
-			/* remove an address from shm provider's av */
-			if (rxr_env.enable_shm_transfer && av_entry->local_mapping) {
-				ret = fi_av_remove(av->shm_rdm_av, &av_entry->shm_rdm_addr, 1, flags);
-				if (ret)
-					goto err_free_av_entry;
-
-				av->shm_used--;
-				assert(av_entry->shm_rdm_addr < rxr_env.shm_av_size);
-				av->shm_rdm_addr_map[av_entry->shm_rdm_addr] = FI_ADDR_UNSPEC;
-			}
-			HASH_DEL(av->av_map, av_entry);
-			free(av_entry);
 		}
-		fastlock_release(&av->util_av.lock);
-	} else {
-		for (i = 0; i < count; i++) {
-			ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i], 1, flags);
+
+		/* Only if the peer is not in use reset the peer */
+		dlist_foreach(&av->util_av.ep_list, ep_list_entry) {
+			peer = efa_ep_get_peer(ep_list_entry, fi_addr[i]);
+			if (peer->rx_init)
+				efa_peer_reset(peer);
+		}
+		ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i], 1,
+				       flags);
+		if (ret)
+			goto release_lock;
+		/* remove an address from shm provider's av */
+		if (rxr_env.enable_shm_transfer && av_entry->local_mapping) {
+			ret = fi_av_remove(av->shm_rdm_av, &av_entry->shm_rdm_addr, 1, flags);
 			if (ret)
-				goto out;
+				goto err_free_av_entry;
+
+			av->shm_used--;
+			assert(av_entry->shm_rdm_addr < rxr_env.shm_av_size);
+			av->shm_rdm_addr_map[av_entry->shm_rdm_addr] = FI_ADDR_UNSPEC;
 		}
+		ret = ofi_av_remove_addr(&av->util_av, fi_addr[i]);
+		if (ret)
+			goto err_free_av_entry;
 	}
+	fastlock_release(&av->util_av.lock);
 	goto out;
 err_free_av_entry:
-	free(av_entry);
+	ofi_ibuf_free(util_av_entry);
 release_lock:
 	fastlock_release(&av->util_av.lock);
 out:
@@ -596,7 +659,6 @@ static struct fi_ops_av efa_av_ops = {
 static int efa_av_close(struct fid *fid)
 {
 	struct efa_av *av;
-	struct efa_av_entry *current_av_entry, *tmp;
 	int ret = 0;
 	int err = 0;
 	int i;
@@ -629,10 +691,6 @@ static int efa_av_close(struct fid *fid)
 			EFA_WARN(FI_LOG_AV, "Failed to close av: %s\n",
 				fi_strerror(ret));
 		}
-		HASH_ITER(hh, av->av_map, current_av_entry, tmp) {
-			HASH_DEL(av->av_map, current_av_entry);
-			free(current_av_entry);
-		}
 	}
 	free(av);
 	return err;
@@ -708,6 +766,7 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 			attr->count = MAX(attr->count, universe_size);
 
 		util_attr.addrlen = EFA_EP_ADDR_LEN;
+		util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN;
 		util_attr.flags = 0;
 		ret = ofi_av_init(&efa_domain->util_domain, attr, &util_attr,
 					&av->util_av, context);
diff --git a/deps/libfabric/prov/efa/src/efa_cq.c b/deps/libfabric/prov/efa/src/efa_cq.c
index cc797d1c700b17fd221426a1ad1126f40a6dc340..2b8fe67a9a0c28b252751f81e3a20a5011f1f38a 100644
--- a/deps/libfabric/prov/efa/src/efa_cq.c
+++ b/deps/libfabric/prov/efa/src/efa_cq.c
@@ -79,6 +79,7 @@ ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry,
 	entry->flags = efa_cq_wc_to_fi_flags(&wce->wc);
 	entry->err = EIO;
 	entry->prov_errno = wce->wc.ibv_wc.status;
+	EFA_WARN(FI_LOG_CQ, "Work completion status: %s\n", ibv_wc_status_str(wce->wc.ibv_wc.status));
 
 	/* We currently don't have err_data to give back to the user. */
 	if (FI_VERSION_GE(api_version, FI_VERSION(1, 5)))
diff --git a/deps/libfabric/prov/efa/src/efa_device.c b/deps/libfabric/prov/efa/src/efa_device.c
index 9850a7814ad31544485ea84c94550b3e0548b95d..d60da55e281bb614e4eaa502764880035a0e780e 100644
--- a/deps/libfabric/prov/efa/src/efa_device.c
+++ b/deps/libfabric/prov/efa/src/efa_device.c
@@ -87,6 +87,8 @@ int efa_device_init(void)
 	int ctx_idx;
 	int ret;
 
+	fastlock_init(&pd_list_lock);
+
 	device_list = ibv_get_device_list(&dev_cnt);
 	if (dev_cnt <= 0)
 		return -ENODEV;
@@ -97,12 +99,19 @@ int efa_device_init(void)
 		goto err_free_dev_list;
 	}
 
+	pd_list = calloc(dev_cnt, sizeof(*pd_list));
+	if (!pd_list) {
+		ret = -ENOMEM;
+		goto err_free_ctx_list;
+	}
+
 	for (ctx_idx = 0; ctx_idx < dev_cnt; ctx_idx++) {
 		ctx_list[ctx_idx] = efa_device_open(device_list[ctx_idx]);
 		if (!ctx_list[ctx_idx]) {
 			ret = -ENODEV;
 			goto err_close_devs;
 		}
+		ctx_list[ctx_idx]->dev_idx = ctx_idx;
 	}
 
 	ibv_free_device_list(device_list);
@@ -112,6 +121,8 @@ int efa_device_init(void)
 err_close_devs:
 	for (ctx_idx--; ctx_idx >= 0; ctx_idx--)
 		efa_device_close(ctx_list[ctx_idx]);
+	free(pd_list);
+err_free_ctx_list:
 	free(ctx_list);
 err_free_dev_list:
 	ibv_free_device_list(device_list);
@@ -119,6 +130,26 @@ err_free_dev_list:
 	return ret;
 }
 
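+/*
+ * Query the first opened device for the EFA RDMA read capability.
+ * Returns false when no device is present, the query fails, or the
+ * build lacks HAVE_RDMA_SIZE support.
+ */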
+bool efa_device_support_rdma_read(void)
+{
+#ifdef HAVE_RDMA_SIZE
+	int err;
+	struct efadv_device_attr efadv_attr;
+
+	if (dev_cnt <= 0)
+		return false;
+
+	assert(dev_cnt > 0);
+	err = efadv_query_device(ctx_list[0]->ibv_ctx, &efadv_attr, sizeof(efadv_attr));
+	if (err)
+		return false;
+
+	return efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ;
+#else
+	return false;
+#endif
+}
+
 void efa_device_free(void)
 {
 	int i;
@@ -126,8 +157,10 @@ void efa_device_free(void)
 	for (i = 0; i < dev_cnt; i++)
 		efa_device_close(ctx_list[i]);
 
+	free(pd_list);
 	free(ctx_list);
 	dev_cnt = 0;
+	fastlock_destroy(&pd_list_lock);
 }
 
 struct efa_context **efa_device_get_context_list(int *num_ctx)
diff --git a/deps/libfabric/prov/efa/src/efa_domain.c b/deps/libfabric/prov/efa/src/efa_domain.c
index 0c01a0bab5e3f0044f980fe7f9cc6170eea3176b..f0add07ee654ce2455a0f8e4f0c0fa27c317aa19 100644
--- a/deps/libfabric/prov/efa/src/efa_domain.c
+++ b/deps/libfabric/prov/efa/src/efa_domain.c
@@ -37,30 +37,52 @@
 #include "efa.h"
 #include "rxr_cntr.h"
 
+fastlock_t pd_list_lock;
+struct efa_pd *pd_list = NULL;
+
 static int efa_domain_close(fid_t fid)
 {
 	struct efa_domain *domain;
+	struct efa_pd *efa_pd;
 	int ret;
 
 	domain = container_of(fid, struct efa_domain,
 			      util_domain.domain_fid.fid);
 
-	if (efa_mr_cache_enable)
-		ofi_mr_cache_cleanup(&domain->cache);
+	if (efa_is_cache_available(domain)) {
+		ofi_mr_cache_cleanup(domain->cache);
+		free(domain->cache);
+		domain->cache = NULL;
+	}
 
 	if (domain->ibv_pd) {
-		ret = -ibv_dealloc_pd(domain->ibv_pd);
-		if (ret) {
-			EFA_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_dealloc_pd", ret);
-			return ret;
+		fastlock_acquire(&pd_list_lock);
+		efa_pd = &pd_list[domain->ctx->dev_idx];
+		if (efa_pd->use_cnt == 1) {
+			ret = -ibv_dealloc_pd(domain->ibv_pd);
+			if (ret) {
+				fastlock_release(&pd_list_lock);
+				EFA_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_dealloc_pd",
+				               ret);
+				return ret;
+			}
+			efa_pd->ibv_pd = NULL;
 		}
+		efa_pd->use_cnt--;
 		domain->ibv_pd = NULL;
+		fastlock_release(&pd_list_lock);
 	}
 
 	ret = ofi_domain_close(&domain->util_domain);
 	if (ret)
 		return ret;
 
+	if (domain->shm_domain) {
+		ret = fi_close(&domain->shm_domain->fid);
+		if (ret)
+			return ret;
+	}
+
 	fi_freeinfo(domain->info);
 	free(domain->qp_table);
 	free(domain);
@@ -94,10 +116,70 @@ static int efa_open_device_by_name(struct efa_domain *domain, const char *name)
 		}
 	}
 
+	/*
+	 * Check if a PD has already been allocated for this device and reuse
+	 * it if this is the case.
+	 */
+	fastlock_acquire(&pd_list_lock);
+	if (pd_list[i].ibv_pd) {
+		domain->ibv_pd = pd_list[i].ibv_pd;
+		pd_list[i].use_cnt++;
+	} else {
+		domain->ibv_pd = ibv_alloc_pd(domain->ctx->ibv_ctx);
+		if (!domain->ibv_pd) {
+			ret = -errno;
+		} else {
+			pd_list[i].ibv_pd = domain->ibv_pd;
+			pd_list[i].use_cnt++;
+		}
+	}
+	fastlock_release(&pd_list_lock);
+
 	efa_device_free_context_list(ctx_list);
 	return ret;
 }
 
+/*
+ * Register a temporary buffer and call ibv_fork_init() to determine if fork
+ * support is enabled.
+ *
+ * This relies on internal behavior in rdma-core and is a temporary workaround.
+ */
+static int efa_check_fork_enabled(struct fid_domain *domain_fid)
+{
+	struct fid_mr *mr;
+	char *buf;
+	int ret;
+
+	buf = malloc(ofi_get_page_size());
+	if (!buf)
+		return -FI_ENOMEM;
+
+	ret = fi_mr_reg(domain_fid, buf, ofi_get_page_size(),
+			FI_SEND, 0, 0, 0, &mr, NULL);
+	if (ret) {
+		free(buf);
+		return ret;
+	}
+
+	/*
+	 * libibverbs maintains a global variable to determine if any
+	 * registrations have occurred before ibv_fork_init() is called.
+	 * EINVAL is returned if a memory region was registered before
+	 * ibv_fork_init() was called and returns 0 if fork support is
+	 * initialized already.
+	 */
+	ret = ibv_fork_init();
+
+	fi_close(&mr->fid);
+	free(buf);
+
+	if (ret == EINVAL)
+		return 0;
+
+	return 1;
+}
+
 static struct fi_ops efa_fid_ops = {
 	.size = sizeof(struct fi_ops),
 	.close = efa_domain_close,
@@ -125,9 +207,14 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 {
 	struct efa_domain *domain;
 	struct efa_fabric *fabric;
+	struct rxr_domain *rxr_domain;
 	const struct fi_info *fi;
 	size_t qp_table_size;
+	bool app_mr_local;
 	int ret;
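+	/* One monitor slot per HMEM interface; only system memory is
+	 * monitored here, using the memhooks monitor. */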
+	struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = {
+		[FI_HMEM_SYSTEM] = memhooks_monitor,
+	};
 
 	fi = efa_get_efa_info(info->domain_attr->name);
 	if (!fi)
@@ -163,21 +250,21 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 		goto err_close_domain;
 	}
 
-	if (EFA_EP_TYPE_IS_RDM(info))
+	if (EFA_EP_TYPE_IS_RDM(info)) {
 		domain->type = EFA_DOMAIN_RDM;
-	else
+		rxr_domain = container_of(domain_fid, struct rxr_domain,
+					  rdm_domain);
+		app_mr_local = rxr_domain->rxr_mr_local;
+	} else {
 		domain->type = EFA_DOMAIN_DGRAM;
+		/* DGRAM always requires FI_MR_LOCAL */
+		app_mr_local = true;
+	}
 
 	ret = efa_open_device_by_name(domain, info->domain_attr->name);
 	if (ret)
 		goto err_free_info;
 
-	domain->ibv_pd = ibv_alloc_pd(domain->ctx->ibv_ctx);
-	if (!domain->ibv_pd) {
-		ret = -errno;
-		goto err_free_info;
-	}
-
 	domain->util_domain.domain_fid.fid.ops = &efa_fid_ops;
 	domain->util_domain.domain_fid.ops = &efa_domain_ops;
 	/* RMA mr_modes are being removed, since EFA layer
@@ -193,9 +280,38 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 	domain->util_domain.mr_map.mode &= ~FI_MR_PROV_KEY;
 	domain->fab = fabric;
 
+	domain->util_domain.domain_fid.mr = &efa_domain_mr_ops;
+
 	*domain_fid = &domain->util_domain.domain_fid;
 
-	if (efa_mr_cache_enable) {
+	domain->cache = NULL;
+
+	/*
+	 * Check whether fork support is enabled when app does not request
+	 * FI_MR_LOCAL even if the cache is disabled.
+	 */
+	if (!app_mr_local && efa_check_fork_enabled(*domain_fid)) {
+		fprintf(stderr,
+		         "\nlibibverbs fork support is not supported by the EFA Libfabric\n"
+			 "provider when memory registrations are handled by the provider.\n"
+			 "\nFork support may currently be enabled via the RDMAV_FORK_SAFE\n"
+			 "or IBV_FORK_SAFE environment variable or another library in your\n"
+			 "application may be calling ibv_fork_init().\n"
+			 "\nPlease refer to https://github.com/ofiwg/libfabric/issues/6332\n"
+			 "for more information. Your job will now abort.\n");
+		abort();
+	}
+
+	/*
+	 * If FI_MR_LOCAL is set, we do not want to use the MR cache.
+	 */
+	if (!app_mr_local && efa_mr_cache_enable) {
+		domain->cache = (struct ofi_mr_cache *)calloc(1, sizeof(struct ofi_mr_cache));
+		if (!domain->cache) {
+			ret = -FI_ENOMEM;
+			goto err_free_info;
+		}
+
 		if (!efa_mr_max_cached_count)
 			efa_mr_max_cached_count = info->domain_attr->mr_cnt *
 			                          EFA_MR_CACHE_LIMIT_MULT;
@@ -204,11 +320,11 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 			                         EFA_MR_CACHE_LIMIT_MULT;
 		cache_params.max_cnt = efa_mr_max_cached_count;
 		cache_params.max_size = efa_mr_max_cached_size;
-		domain->cache.entry_data_size = sizeof(struct efa_mr);
-		domain->cache.add_region = efa_mr_cache_entry_reg;
-		domain->cache.delete_region = efa_mr_cache_entry_dereg;
-		ret = ofi_mr_cache_init(&domain->util_domain, uffd_monitor,
-					&domain->cache);
+		domain->cache->entry_data_size = sizeof(struct efa_mr);
+		domain->cache->add_region = efa_mr_cache_entry_reg;
+		domain->cache->delete_region = efa_mr_cache_entry_dereg;
+		ret = ofi_mr_cache_init(&domain->util_domain, memory_monitors,
+					domain->cache);
 		if (!ret) {
 			domain->util_domain.domain_fid.mr = &efa_domain_mr_cache_ops;
 			EFA_INFO(FI_LOG_DOMAIN, "EFA MR cache enabled, max_cnt: %zu max_size: %zu\n",
@@ -217,9 +333,8 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
 		}
 	}
 
-	domain->util_domain.domain_fid.mr = &efa_domain_mr_ops;
-	efa_mr_cache_enable = 0;
-
+	free(domain->cache);
+	domain->cache = NULL;
 	return 0;
 err_free_info:
 	fi_freeinfo(domain->info);
diff --git a/deps/libfabric/prov/efa/src/efa_ep.c b/deps/libfabric/prov/efa/src/efa_ep.c
index 44d5c2248caaff417f67dc3ba94476e8d8ef7819..bbc376ea44695fc81b512206d5c8068fae109034 100644
--- a/deps/libfabric/prov/efa/src/efa_ep.c
+++ b/deps/libfabric/prov/efa/src/efa_ep.c
@@ -221,6 +221,9 @@ err:
 
 static void efa_ep_destroy(struct efa_ep *ep)
 {
+	if (ep->self_ah)
+		ibv_destroy_ah(ep->self_ah);
+
 	efa_ep_destroy_qp(ep->qp);
 	fi_freeinfo(ep->info);
 	free(ep->src_addr);
@@ -367,13 +370,34 @@ static int efa_ep_setflags(struct fid_ep *ep_fid, uint64_t flags)
 	return 0;
 }
 
+/* efa_ep_create_self_ah() creates an address handle for
+ * an EP's own address. The address handle is used by
+ * the EP to read from itself, e.g. to copy data from
+ * host memory to GPU memory.
+ */
+static inline
+int efa_ep_create_self_ah(struct efa_ep *ep, struct ibv_pd *ibv_pd)
+{
+	struct ibv_ah_attr ah_attr;
+	struct efa_ep_addr *self_addr;
+
+	self_addr = (struct efa_ep_addr *)ep->src_addr;
+
+	memset(&ah_attr, 0, sizeof(ah_attr));
+	ah_attr.port_num = 1;
+	ah_attr.is_global = 1;
+	memcpy(ah_attr.grh.dgid.raw, self_addr->raw, sizeof(self_addr->raw));
+	ep->self_ah = ibv_create_ah(ibv_pd, &ah_attr);
+	return ep->self_ah ? 0 : -FI_EINVAL;
+}
+
 static int efa_ep_enable(struct fid_ep *ep_fid)
 {
 	struct ibv_qp_init_attr_ex attr_ex = { 0 };
 	const struct fi_info *efa_info;
 	struct ibv_pd *ibv_pd;
 	struct efa_ep *ep;
-
+	int err;
+
 	ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid);
 
 	if (!ep->scq && !ep->rcq) {
@@ -436,7 +460,18 @@ static int efa_ep_enable(struct fid_ep *ep_fid)
 	attr_ex.qp_context = ep;
 	attr_ex.sq_sig_all = 1;
 
-	return efa_ep_create_qp_ex(ep, ibv_pd, &attr_ex);
+	err = efa_ep_create_qp_ex(ep, ibv_pd, &attr_ex);
+	if (err)
+		return err;
+
+	err = efa_ep_create_self_ah(ep, ibv_pd);
+	if (err) {
+		EFA_WARN(FI_LOG_EP_CTRL,
+			 "Endpoint cannot create ah for its own address\n");
+		efa_ep_destroy_qp(ep->qp);
+	}
+
+	return err;
 }
 
 static int efa_ep_control(struct fid *fid, int command, void *arg)
diff --git a/deps/libfabric/prov/efa/src/efa_fabric.c b/deps/libfabric/prov/efa/src/efa_fabric.c
index bb2615345b3ce9255f463949df3a369134ef47a1..006c47f82cff56b0b1c8186baa3af68345ea5b3a 100644
--- a/deps/libfabric/prov/efa/src/efa_fabric.c
+++ b/deps/libfabric/prov/efa/src/efa_fabric.c
@@ -83,6 +83,7 @@
 int efa_mr_cache_enable		= EFA_DEF_MR_CACHE_ENABLE;
 size_t efa_mr_max_cached_count;
 size_t efa_mr_max_cached_size;
+int efa_set_rdmav_hugepages_safe = 0;
 
 static void efa_addr_to_str(const uint8_t *raw_addr, char *str);
 static int efa_get_addr(struct efa_context *ctx, void *src_addr);
@@ -100,11 +101,7 @@ const struct fi_domain_attr efa_domain_attr = {
 	.control_progress	= FI_PROGRESS_AUTO,
 	.data_progress		= FI_PROGRESS_AUTO,
 	.resource_mgmt		= FI_RM_DISABLED,
-#ifdef HAVE_LIBCUDA
-	.mr_mode		= OFI_MR_BASIC_MAP | FI_MR_LOCAL | FI_MR_BASIC | FI_MR_HMEM,
-#else
 	.mr_mode		= OFI_MR_BASIC_MAP | FI_MR_LOCAL | FI_MR_BASIC,
-#endif
 	.mr_key_size		= sizeof_field(struct ibv_sge, lkey),
 	.cq_data_size		= 0,
 	.tx_ctx_cnt		= 1024,
@@ -427,9 +424,9 @@ static int efa_alloc_fid_nic(struct fi_info *fi, struct efa_context *ctx,
 
 	efa_addr_to_str(src_addr, link_attr->address);
 
-	link_attr->mtu = port_attr->max_msg_sz;
-
-	link_attr->speed = 0;
+	link_attr->mtu = port_attr->max_msg_sz - rxr_pkt_max_header_size();
+	link_attr->speed = ofi_vrb_speed(port_attr->active_speed,
+	                                 port_attr->active_width);
 
 	switch (port_attr->state) {
 	case IBV_PORT_DOWN:
@@ -469,6 +466,55 @@ err_free_nic:
 	return ret;
 }
 
+#if HAVE_LIBCUDA
+/*
+ * efa_get_gdr_support() checks whether GPUDirect RDMA is supported by
+ * reading the sysfs file "class/infiniband/<device_name>/device/gdr".
+ *
+ * Return value:
+ *   1 if the sysfs file exists and contains 1.
+ *   0 if the sysfs file does not exist or contains 0.
+ *   a negative value if an error happened.
+ */
+static int efa_get_gdr_support(char *device_name)
+{
+	static const int MAX_GDR_SUPPORT_STRLEN = 8;
+	char *gdr_path = NULL;
+	char gdr_support_str[MAX_GDR_SUPPORT_STRLEN];
+	int ret, read_len;
+
+	ret = asprintf(&gdr_path, "class/infiniband/%s/device/gdr", device_name);
+	if (ret < 0) {
+		EFA_INFO_ERRNO(FI_LOG_FABRIC, "asprintf to build sysfs file name failed", ret);
+		goto out;
+	}
+
+	ret = fi_read_file(get_sysfs_path(), gdr_path,
+			   gdr_support_str, MAX_GDR_SUPPORT_STRLEN);
+	if (ret < 0) {
+		if (errno == ENOENT) {
+			/* sysfs file does not exist, gdr is not supported */
+			ret = 0;
+		}
+
+		goto out;
+	}
+
+	if (ret == 0) {
+		EFA_WARN(FI_LOG_FABRIC, "Sysfs file %s is empty\n", gdr_path);
+		ret = -FI_EINVAL;
+		goto out;
+	}
+
+	read_len = MIN(ret, MAX_GDR_SUPPORT_STRLEN);
+	ret = (0 == strncmp(gdr_support_str, "1", read_len));
+out:
+	free(gdr_path);
+	return ret;
+}
+#endif
+
 static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info)
 {
 	struct efadv_device_attr efadv_attr;
@@ -515,6 +561,23 @@ static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info)
 	info->domain_attr->resource_mgmt	= FI_RM_DISABLED;
 	info->domain_attr->mr_cnt		= base_attr->max_mr;
 
+#if HAVE_LIBCUDA
+	if (info->ep_attr->type == FI_EP_RDM) {
+		ret = efa_get_gdr_support(ctx->ibv_ctx->device->name);
+		if (ret < 0) {
+			EFA_WARN(FI_LOG_FABRIC, "get gdr support failed!\n");
+			return ret;
+		}
+
+		if (ret == 1) {
+			info->caps			|= FI_HMEM;
+			info->tx_attr->caps		|= FI_HMEM;
+			info->rx_attr->caps		|= FI_HMEM;
+			info->domain_attr->mr_mode	|= FI_MR_HMEM;
+		}
+	}
+#endif
+
 	EFA_DBG(FI_LOG_DOMAIN, "Domain attribute :\n"
 				"\t info->domain_attr->cq_cnt		= %zu\n"
 				"\t info->domain_attr->ep_cnt		= %zu\n"
@@ -861,7 +924,6 @@ static int efa_fabric_close(fid_t fid)
 	struct efa_fabric *fab;
 	int ret;
 
-	unsetenv("RDMAV_HUGEPAGES_SAFE");
 	fab = container_of(fid, struct efa_fabric, util_fabric.fabric_fid.fid);
 	ret = ofi_fabric_close(&fab->util_fabric);
 	if (ret)
@@ -888,36 +950,54 @@ static struct fi_ops_fabric efa_ops_fabric = {
 	.trywait = ofi_trywait
 };
 
+static
+void efa_atfork_callback(void)
+{
+	static int visited = 0;
+
+	if (visited)
+		return;
+
+	visited = 1;
+	if (getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE") )
+		return;
+
+	fprintf(stderr,
+		"A process has executed an operation involving a call\n"
+		"to the fork() system call to create a child process.\n"
+		"\n"
+		"As a result, the libfabric EFA provider is operating in\n"
+		"a condition that could result in memory corruption or\n"
+		"other system errors.\n"
+		"\n"
+		"For the libfabric EFA provider to work safely when fork()\n"
+		"is called, the application must handle memory registrations\n"
+		"(FI_MR_LOCAL) and you will need to set the following environment\n"
+		"variables:\n"
+		"          RDMAV_FORK_SAFE=1\n"
+		"MPI applications do not support this mode.\n"
+		"\n"
+		"However, this setting can result in signficant performance\n"
+		"impact to your application due to increased cost of memory\n"
+		"registration.\n"
+		"\n"
+		"You may want to check with your application vendor to see\n"
+		"if an application-level alternative (of not using fork)\n"
+		"exists.\n"
+		"\n"
+		"Please refer to https://github.com/ofiwg/libfabric/issues/6332\n"
+		"for more information.\n"
+		"\n"
+		"Your job will now abort.\n");
+	abort();
+}
+
 int efa_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric_fid,
 	       void *context)
 {
 	const struct fi_info *info;
 	struct efa_fabric *fab;
 	int ret = 0;
-
-	/*
-	 * Enable rdma-core fork support and huge page support. We want call
-	 * this only when the EFA provider is selected. It is safe to call this
-	 * function again if multiple EFA fabrics are opened or if the fabric
-	 * is closed and opened again.
-	 *
-	 * TODO: allow users to disable this once the fork() to check ptrace
-	 * permissions is removed.
-	 */
-	ret = setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
-	if (ret)
-		return -errno;
-
-	ret = ibv_fork_init();
-	if (ret) {
-		EFA_WARN(FI_LOG_FABRIC, "Failed to initialize libibverbs "
-					"fork support. Please check your "
-					"application to ensure it is not "
-					"making verbs calls before "
-					"initializing EFA.\n");
-		return -ret;
-	}
-
 	fab = calloc(1, sizeof(*fab));
 	if (!fab)
 		return -FI_ENOMEM;
@@ -947,6 +1027,9 @@ static void fi_efa_fini(void)
 	struct efa_context **ctx_list;
 	int num_devices;
 
+	if (efa_set_rdmav_hugepages_safe)
+		unsetenv("RDMAV_HUGEPAGES_SAFE");
+
 	fi_freeinfo((void *)efa_util_prov.info);
 	efa_util_prov.info = NULL;
 
@@ -955,7 +1038,7 @@ static void fi_efa_fini(void)
 	efa_device_free();
 #if HAVE_EFA_DL
 	smr_cleanup();
-#endif 
+#endif
 }
 
 struct fi_provider efa_prov = {
@@ -1017,6 +1100,32 @@ static int efa_init_info(const struct fi_info **all_infos)
 
 struct fi_provider *init_lower_efa_prov()
 {
+	int err;
+
+	if (!getenv("RDMAV_HUGEPAGES_SAFE")) {
+		/*
+		 * Setting RDMAV_HUGEPAGES_SAFE alone will not impact
+		 * application performance, because rdma-core will only
+		 * check this environment variable when either
+		 * RDMAV_FORK_SAFE or IBV_FORK_SAFE is set.
+		 */
+		err = setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+		if (err) {
+			EFA_WARN(FI_LOG_FABRIC,
+				 "Unable to set environment variable RDMAV_HUGEPAGES_SAFE\n");
+			return NULL;
+		}
+
+		efa_set_rdmav_hugepages_safe = 1;
+	}
+
+	err = pthread_atfork(efa_atfork_callback, NULL, NULL);
+	if (err) {
+		EFA_WARN(FI_LOG_FABRIC,
+			 "Unable to register atfork callback\n");
+		return NULL;
+	}
+
 	if (efa_init_info(&efa_util_prov.info))
 		return NULL;
 
diff --git a/deps/libfabric/prov/efa/src/efa_mr.c b/deps/libfabric/prov/efa/src/efa_mr.c
index 48528aa1898b2545030a77da68365ca6a2437e21..f89eb34bc34b6d3551cd845dde52a6f2aeb8c7e1 100644
--- a/deps/libfabric/prov/efa/src/efa_mr.c
+++ b/deps/libfabric/prov/efa/src/efa_mr.c
@@ -44,7 +44,7 @@ static int efa_mr_cache_close(fid_t fid)
 	struct efa_mr *efa_mr = container_of(fid, struct efa_mr,
 					       mr_fid.fid);
 
-	ofi_mr_cache_delete(&efa_mr->domain->cache, efa_mr->entry);
+	ofi_mr_cache_delete(efa_mr->domain->cache, efa_mr->entry);
 
 	return 0;
 }
@@ -151,6 +151,7 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	struct efa_mr *efa_mr;
 	struct ofi_mr_entry *entry;
 	int ret;
+	static const int EFA_MR_CACHE_FLUSH_CHECK = 512;
 
 	if (flags & OFI_MR_NOCACHE) {
 		ret = efa_mr_regattr(fid, attr, flags, mr_fid);
@@ -166,15 +167,22 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	domain = container_of(fid, struct efa_domain,
 			      util_domain.domain_fid.fid);
 
-	ret = ofi_mr_cache_search(&domain->cache, attr, &entry);
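+	/*
+	 * Periodically flush dead registration cache entries so stale
+	 * regions do not accumulate between cache searches.
+	 */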
+	if (domain->cache->cached_cnt > 0 &&
+	    domain->cache->cached_cnt % EFA_MR_CACHE_FLUSH_CHECK == 0)
+		ofi_mr_cache_flush(domain->cache, false);
+
+	ret = ofi_mr_cache_search(domain->cache, attr, &entry);
 	if (OFI_UNLIKELY(ret))
 		return ret;
 
 	efa_mr = (struct efa_mr *)entry->data;
 	efa_mr->entry = entry;
 
-	efa_mr->peer.iface = attr->iface;
-	if (attr->iface == FI_HMEM_CUDA)
+	if (domain->util_domain.info_domain_caps & FI_HMEM)
+		efa_mr->peer.iface = attr->iface;
+	else
+		efa_mr->peer.iface = FI_HMEM_SYSTEM;
+	if (efa_mr->peer.iface == FI_HMEM_CUDA)
 		efa_mr->peer.device.cuda = attr->device.cuda;
 
 	*mr_fid = &efa_mr->mr_fid;
@@ -280,7 +288,7 @@ struct fi_ops efa_mr_ops = {
  */
 static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 {
-	uint64_t core_access;
+	uint64_t core_access, original_access;
 	struct fi_mr_attr *mr_attr = (struct fi_mr_attr *)attr;
 	int fi_ibv_access = 0;
 	int ret = 0;
@@ -310,8 +318,15 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 
 	efa_mr->mr_fid.mem_desc = efa_mr;
 	efa_mr->mr_fid.key = efa_mr->ibv_mr->rkey;
-	efa_mr->peer.iface = mr_attr->iface;
-	if (mr_attr->iface == FI_HMEM_CUDA)
+	/*
+	 * Skipping the domain type check is okay here since util_domain is at
+	 * the beginning of efa_domain and rxr_domain.
+	 */
+	if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM)
+		efa_mr->peer.iface = mr_attr->iface;
+	else
+		efa_mr->peer.iface = FI_HMEM_SYSTEM;
+	if (efa_mr->peer.iface == FI_HMEM_CUDA)
 		efa_mr->peer.device.cuda = mr_attr->device.cuda;
 	assert(efa_mr->mr_fid.key != FI_KEY_NOTAVAIL);
 
@@ -327,8 +342,14 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr)
 		return ret;
 	}
 	if (efa_mr->domain->shm_domain && rxr_env.enable_shm_transfer) {
+		/* We need to add FI_REMOTE_READ to allow for message
+		 * protocols implemented over RDMA read.
+		 */
+		original_access = mr_attr->access;
+		mr_attr->access |= FI_REMOTE_READ;
 		ret = fi_mr_regattr(efa_mr->domain->shm_domain, attr,
 				    flags, &efa_mr->shm_mr);
+		mr_attr->access = original_access;
 		if (ret) {
 			EFA_WARN(FI_LOG_MR,
 				"Unable to register shm MR buf (%s): %p len: %zu\n",
diff --git a/deps/libfabric/prov/efa/src/efa_rma.c b/deps/libfabric/prov/efa/src/efa_rma.c
index ecddb5a11343c0036a6c8c40b3360339bcdbe1d3..97c681311c7683d5e129f3b5ff70b3e26fccddb3 100644
--- a/deps/libfabric/prov/efa/src/efa_rma.c
+++ b/deps/libfabric/prov/efa/src/efa_rma.c
@@ -37,8 +37,24 @@
 #include <ofi_iov.h>
 #include "efa.h"
 
-static
-ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
+
+/*
+ * efa_rma_post_read() posts a read request.
+ *
+ * Input:
+ *     ep: endpoint
+ *     msg: read operation information
+ *     flags: currently no flags are taken
+ *     self_comm: indicates whether the read targets the endpoint
+ *                itself. If self_comm is true, the caller must set
+ *                msg->addr to FI_ADDR_NOTAVAIL.
+ *
+ * Return 0 on success.
+ * If the read iov or rma_iov count exceeds the device limit, return -FI_EINVAL.
+ * If the read failed, return the error of the read operation.
+ */
+ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg,
+			  uint64_t flags, bool self_comm)
 {
 	struct efa_qp *qp;
 	struct efa_mr *efa_mr;
@@ -80,8 +96,16 @@ ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg, uint6
 	}
 
 	ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list);
-	conn = ep->av->addr_to_conn(ep->av, msg->addr);
-	ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah.ibv_ah, conn->ep_addr.qpn, conn->ep_addr.qkey);
+	if (self_comm) {
+		assert(msg->addr == FI_ADDR_NOTAVAIL);
+		ibv_wr_set_ud_addr(qp->ibv_qp_ex, ep->self_ah,
+				   qp->qp_num, qp->qkey);
+	} else {
+		conn = ep->av->addr_to_conn(ep->av, msg->addr);
+		ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah.ibv_ah,
+				   conn->ep_addr.qpn, conn->ep_addr.qkey);
+	}
+
 	return ibv_wr_complete(qp->ibv_qp_ex);
 }
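For illustration, a hypothetical internal caller of the self-communication path could look like the sketch below; the helper name is invented and error handling is elided.

```c
/* Illustrative only: post a read against the endpoint's own address
 * handle, e.g. to move data between host and device buffers. The
 * address must be FI_ADDR_NOTAVAIL so that the self AH path is taken. */
static ssize_t efa_read_from_self(struct efa_ep *ep, struct fi_msg_rma *msg)
{
	msg->addr = FI_ADDR_NOTAVAIL;
	return efa_rma_post_read(ep, msg, 0, true);
}
```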
 
@@ -90,7 +114,7 @@ ssize_t efa_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uin
 {
 	struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid);
 
-	return efa_rma_post_read(ep, msg, flags);
+	return efa_rma_post_read(ep, msg, flags, false);
 }
 
 static
diff --git a/deps/libfabric/prov/efa/src/rxr/efa_cuda.h b/deps/libfabric/prov/efa/src/rxr/efa_cuda.h
deleted file mode 100644
index af1d84e3ffdc9f05d3408197af739ed006c250ca..0000000000000000000000000000000000000000
--- a/deps/libfabric/prov/efa/src/rxr/efa_cuda.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
- * All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if HAVE_CONFIG_H
-#include <config.h>
-#endif /* HAVE_CONFIG_H */
-
-#ifndef _EFA_CUDA_H_
-#define _EFA_CUDA_H_
-
-#include "efa.h"
-#include "rxr.h"
-
-#ifdef HAVE_LIBCUDA
-
-#include <ofi_cuda.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-static inline bool rxr_ep_is_cuda_mr(struct efa_mr *efa_mr)
-{
-	return efa_mr ? (efa_mr->peer.iface == FI_HMEM_CUDA): false;
-}
-
-#else
-
-static inline bool rxr_ep_is_cuda_mr(struct efa_mr *efa_mr)
-{
-	return false;
-}
-
-#endif /* HAVE_LIBCUDA */
-
-static inline
-size_t rxr_copy_from_tx(void *buf, size_t tocopy,
-			struct rxr_tx_entry *tx_entry, size_t offset)
-{
-	size_t data_size;
-
-#ifdef HAVE_LIBCUDA
-	if (rxr_ep_is_cuda_mr(tx_entry->desc[0]))
-		data_size = ofi_copy_from_cuda_iov(buf,
-						   tocopy,
-						   tx_entry->iov,
-						   tx_entry->iov_count,
-						   offset);
-       else
-#endif
-		data_size = ofi_copy_from_iov(buf,
-					      tocopy,
-					      tx_entry->iov,
-					      tx_entry->iov_count,
-					      offset);
-	return data_size;
-}
-
-static inline
-size_t rxr_copy_to_rx(void *data, size_t tocopy, struct rxr_rx_entry *rx_entry, size_t offset)
-{
-	size_t data_size;
-#ifdef HAVE_LIBCUDA
-	if (rxr_ep_is_cuda_mr(rx_entry->desc[0]))
-		data_size = ofi_copy_to_cuda_iov(rx_entry->iov,
-						 rx_entry->iov_count,
-						 offset,
-						 data,
-						 tocopy);
-	else
-#endif
-		data_size = ofi_copy_to_iov(rx_entry->iov,
-					    rx_entry->iov_count,
-					    offset,
-					    data,
-					    tocopy);
-
-	return data_size;
-}
-
-
-#endif /* _EFA_CUDA_H_ */
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr.h b/deps/libfabric/prov/efa/src/rxr/rxr.h
index 167db56b16032f1cc787c555edfadfa59763abb6..8397b35a94724292db1c0328928ffd15da156fe1 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr.h
@@ -61,13 +61,11 @@
 #include <uthash.h>
 #include <ofi_recvwin.h>
 #include <ofi_perf.h>
+#include <ofi_hmem.h>
 
 #include "rxr_pkt_entry.h"
 #include "rxr_pkt_type.h"
 
-#define RXR_MAJOR_VERSION	(2)
-#define RXR_MINOR_VERSION	(0)
-
 /*
  * EFA support interoperability between protocol version 4 and above,
  * and version 4 is considered the base version.
@@ -166,6 +164,7 @@ static inline void rxr_poison_mem_region(uint32_t *ptr, size_t size)
  * 60 - 63      provider specific
  */
 #define RXR_NO_COMPLETION	BIT_ULL(60)
+#define RXR_NO_COUNTER		BIT_ULL(61)
 
 /*
  * RM flags
@@ -193,10 +192,13 @@ struct rxr_env {
 	int tx_max_credits;
 	int tx_queue_size;
 	int use_device_rdma;
+	int use_zcpy_rx;
+	int zcpy_rx_seed;
 	int enable_shm_transfer;
 	int shm_av_size;
 	int shm_max_medium_size;
 	int recvwin_size;
+	int readcopy_pool_size;
 	int cq_size;
 	size_t max_memcpy_size;
 	size_t mtu_size;
@@ -211,6 +213,7 @@ struct rxr_env {
 	size_t efa_cq_read_size;
 	size_t shm_cq_read_size;
 	size_t efa_max_medium_msg_size;
+	size_t efa_min_read_msg_size;
 	size_t efa_min_read_write_size;
 	size_t efa_read_segment_size;
 };
@@ -259,6 +262,11 @@ enum rxr_rx_comm_type {
 	RXR_RX_WAIT_ATOMRSP_SENT, /* rx_entry wait for atomrsp packet sent completion */
 };
 
+enum rxr_rx_buf_owner {
+	RXR_RX_PROV_BUF = 0,	 /* Bounce buffers allocated and owned by provider */
+	RXR_RX_USER_BUF,	 /* Recv buffers posted by applications */
+};
+
 #define RXR_PEER_REQ_SENT BIT_ULL(0) /* sent a REQ to the peer, peer should send a handshake back */
 #define RXR_PEER_HANDSHAKE_SENT BIT_ULL(1)
 #define RXR_PEER_HANDSHAKE_RECEIVED BIT_ULL(2)
@@ -295,7 +303,6 @@ struct rxr_peer {
 	int timeout_interval;		/* initial RNR timeout value */
 	int rnr_timeout_exp;		/* RNR timeout exponentation calc val */
 	struct dlist_entry rnr_entry;	/* linked to rxr_ep peer_backoff_list */
-	struct dlist_entry entry;	/* linked to rxr_ep peer_list */
 };
 
 struct rxr_queued_ctrl_info {
@@ -347,7 +354,8 @@ struct rxr_rx_entry {
 	uint64_t tag;
 	uint64_t ignore;
 
-	uint64_t bytes_done;
+	uint64_t bytes_received;
+	uint64_t bytes_copied;
 	int64_t window;
 	uint16_t credit_request;
 	int credit_cts;
@@ -363,8 +371,10 @@ struct rxr_rx_entry {
 	size_t iov_count;
 	struct iovec iov[RXR_IOV_LIMIT];
 
-	/* App-provided reg descriptor */
+	/* App-provided buffers and descriptors */
 	void *desc[RXR_IOV_LIMIT];
+	enum rxr_rx_buf_owner owner;
+	struct fi_msg *posted_recv;
 
 	/* iov_count on sender side, used for large message READ over shm */
 	size_t rma_iov_count;
@@ -488,6 +498,7 @@ struct rxr_domain {
 	size_t mtu_size;
 	size_t addrlen;
 	uint8_t mr_local;
+	uint8_t rxr_mr_local;
 	uint64_t rdm_mode;
 	int do_progress;
 	size_t cq_size;
@@ -506,8 +517,8 @@ struct rxr_ep {
 	/* per-peer information */
 	struct rxr_peer *peer;
 
-	/* free stack for reorder buffer */
-	struct rxr_robuf_fs *robuf_fs;
+	/* bufpool for reorder buffer */
+	struct ofi_bufpool *robuf_pool;
 
 	/* core provider fid */
 	struct fid_ep *rdm_ep;
@@ -532,6 +543,9 @@ struct rxr_ep {
 	/* core's capabilities */
 	uint64_t core_caps;
 
+	/* Endpoint's capability to support zero-copy rx */
+	bool use_zcpy_rx;
+
 	/* rx/tx queue size of core provider */
 	size_t core_rx_size;
 	size_t max_outstanding_tx;
@@ -546,6 +560,12 @@ struct rxr_ep {
 	/* core's supported tx/rx msg_order */
 	uint64_t core_msg_order;
 
+	/* Application's maximum msg size hint */
+	size_t max_msg_size;
+
+	/* RxR protocol's max header size */
+	size_t max_proto_hdr_size;
+
 	/* tx iov limit of core provider */
 	size_t core_iov_limit;
 
@@ -567,6 +587,11 @@ struct rxr_ep {
 	struct ofi_bufpool *rx_unexp_pkt_pool;
 	struct ofi_bufpool *rx_ooo_pkt_pool;
 
+	/* staging area for read copy */
+	struct ofi_bufpool *rx_readcopy_pkt_pool;
+	int rx_readcopy_pkt_pool_used;
+	int rx_readcopy_pkt_pool_max_used;
+
 #ifdef ENABLE_EFA_POISONING
 	size_t tx_pkt_pool_entry_sz;
 	size_t rx_pkt_pool_entry_sz;
@@ -605,8 +630,6 @@ struct rxr_ep {
 	struct dlist_entry read_pending_list;
 	/* rxr_peer entries that are in backoff due to RNR */
 	struct dlist_entry peer_backoff_list;
-	/* rxr_peer entries with an allocated robuf */
-	struct dlist_entry peer_list;
 
 #if ENABLE_DEBUG
 	/* rx_entries waiting for data to arrive (large messages) */
@@ -670,15 +693,25 @@ static inline struct rxr_peer *rxr_ep_get_peer(struct rxr_ep *ep,
 	return &ep->peer[addr];
 }
 
+static inline void rxr_setup_msg(struct fi_msg *msg, const struct iovec *iov, void **desc,
+				 size_t count, fi_addr_t addr, void *context, uint32_t data)
+{
+	msg->msg_iov = iov;
+	msg->desc = desc;
+	msg->iov_count = count;
+	msg->addr = addr;
+	msg->context = context;
+	msg->data = data;
+}
+
 static inline void rxr_ep_peer_init_rx(struct rxr_ep *ep, struct rxr_peer *peer)
 {
 	assert(!peer->rx_init);
 
-	peer->robuf = freestack_pop(ep->robuf_fs);
+	peer->robuf = ofi_buf_alloc(ep->robuf_pool);
+	assert(peer->robuf);
 	peer->robuf = ofi_recvwin_buf_alloc(peer->robuf,
 					    rxr_env.recvwin_size);
-	assert(peer->robuf);
-	dlist_insert_tail(&peer->entry, &ep->peer_list);
 	peer->rx_credits = rxr_env.rx_window_size;
 	peer->rx_init = 1;
 }
@@ -794,6 +827,17 @@ static inline int rxr_need_sas_ordering(struct rxr_ep *ep)
 	return ep->msg_order & FI_ORDER_SAS;
 }
 
+static inline int rxr_ep_use_zcpy_rx(struct rxr_ep *ep, struct fi_info *info)
+{
+	return !(ep->util_ep.caps & FI_DIRECTED_RECV) &&
+		!(ep->util_ep.caps & FI_TAGGED) &&
+		!(ep->util_ep.caps & FI_ATOMIC) &&
+		(ep->max_msg_size <= ep->mtu_size - ep->max_proto_hdr_size) &&
+		!rxr_need_sas_ordering(ep) &&
+		info->mode & FI_MSG_PREFIX &&
+		rxr_env.use_zcpy_rx;
+}
+
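
The predicate above admits only endpoints whose entire message profile fits a single prefix-capable, MTU-sized receive. From the application side, hints along these lines could satisfy it, assuming the FI_EFA_USE_ZCPY_RX knob is left at its default of 1; the sizes are illustrative:

	/* Sketch: hints under which rxr_ep_use_zcpy_rx() can evaluate true. */
	struct fi_info *hints = fi_allocinfo();

	hints->caps = FI_MSG;                /* no FI_TAGGED, FI_ATOMIC, or FI_DIRECTED_RECV */
	hints->mode = FI_MSG_PREFIX;         /* app reserves header space in its buffers */
	hints->ep_attr->max_msg_size = 4096; /* must fit mtu_size - max_proto_hdr_size */
	hints->rx_attr->msg_order = 0;       /* no FI_ORDER_SAS */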
 /* Initialization functions */
 void rxr_reset_rx_tx_to_core(const struct fi_info *user_info,
 			     struct fi_info *core_info);
@@ -813,7 +857,8 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 /* EP sub-functions */
 void rxr_ep_progress(struct util_ep *util_ep);
 void rxr_ep_progress_internal(struct rxr_ep *rxr_ep);
-int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags, enum rxr_lower_ep_type lower_ep);
+int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv,
+		    uint64_t flags, enum rxr_lower_ep_type lower_ep);
 
 int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep,
 				 struct rxr_tx_entry *tx_entry);
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_attr.c b/deps/libfabric/prov/efa/src/rxr/rxr_attr.c
index ab1d96e350929415ddbcfe0f460db311a171684b..77b2a9eb6d97227b3aaac77347c8162da6d2c548 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_attr.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_attr.c
@@ -37,7 +37,7 @@
 const uint32_t rxr_poison_value = 0xdeadbeef;
 #endif
 
-#ifdef HAVE_LIBCUDA
+#if HAVE_LIBCUDA
 #define EFA_HMEM_CAP FI_HMEM
 #else
 #define EFA_HMEM_CAP 0
@@ -52,7 +52,7 @@ const uint32_t rxr_poison_value = 0xdeadbeef;
 /* TODO: Add support for true FI_DELIVERY_COMPLETE */
 #define RXR_TX_OP_FLAGS (FI_INJECT | FI_COMPLETION | FI_TRANSMIT_COMPLETE | \
 			 FI_DELIVERY_COMPLETE)
-#define RXR_RX_OP_FLAGS (FI_COMPLETION)
+#define RXR_RX_OP_FLAGS (FI_COMPLETION | FI_MULTI_RECV)
 
 struct fi_tx_attr rxr_tx_attr = {
 	.caps = RXR_TX_CAPS,
@@ -84,6 +84,7 @@ struct fi_ep_attr rxr_ep_attr = {
 	.mem_tag_format = FI_TAG_GENERIC,
 	.protocol_version = RXR_CUR_PROTOCOL_VERSION,
 	.max_msg_size = UINT64_MAX,
+	.msg_prefix_size = 0,
 	.tx_ctx_cnt = 1,
 	.rx_ctx_cnt = 1
 };
@@ -110,7 +111,7 @@ struct fi_domain_attr rxr_domain_attr = {
 };
 
 struct fi_fabric_attr rxr_fabric_attr = {
-	.prov_version = FI_VERSION(RXR_MAJOR_VERSION, RXR_MINOR_VERSION),
+	.prov_version = OFI_VERSION_DEF_PROV,
 };
 
 struct fi_info rxr_info = {
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_cq.c b/deps/libfabric/prov/efa/src/rxr/rxr_cq.c
index 94ea470c1dcd0b22c62627bf5a3d36e2de840da0..36218960b07e6d13ba156f9c3cb5566b32106de3 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_cq.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_cq.c
@@ -818,11 +818,16 @@ void rxr_cq_handle_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entr
 {
 	int ret;
 	struct rxr_peer *peer;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	if (tx_entry->state == RXR_TX_SEND)
 		dlist_remove(&tx_entry->entry);
 
-	if (efa_mr_cache_enable && rxr_ep_mr_local(ep)) {
+	if (efa_is_cache_available(efa_domain) && rxr_ep_mr_local(ep)) {
 		ret = rxr_tx_entry_mr_dereg(tx_entry);
 		if (OFI_UNLIKELY(ret)) {
 			FI_WARN(&rxr_prov, FI_LOG_MR,
@@ -847,7 +852,7 @@ void rxr_cq_handle_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entr
 
 		if (ep->util_ep.caps & FI_RMA_EVENT) {
 			rx_entry->cq_entry.len = rx_entry->total_len;
-			rx_entry->bytes_done = rx_entry->total_len;
+			rx_entry->bytes_copied = rx_entry->total_len;
 			efa_cntr_report_rx_completion(&ep->util_ep, rx_entry->cq_entry.flags);
 		}
 
@@ -858,7 +863,8 @@ void rxr_cq_handle_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entr
 		if (tx_entry->fi_flags & FI_COMPLETION) {
 			rxr_cq_write_tx_completion(ep, tx_entry);
 		} else {
-			efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
+			if (!(tx_entry->fi_flags & RXR_NO_COUNTER))
+				efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags);
 			rxr_release_tx_entry(ep, tx_entry);
 		}
 	} else {
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_domain.c b/deps/libfabric/prov/efa/src/rxr/rxr_domain.c
index 93846a4fd854ba5a45a0ba8a53a05d94b1390ef4..4edf617ff9e6598cacbe8ec5cf077ca8d05978f3 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_domain.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_domain.c
@@ -61,13 +61,9 @@ static int rxr_domain_close(fid_t fid)
 {
 	int ret;
 	struct rxr_domain *rxr_domain;
-	struct efa_domain *efa_domain;
 
 	rxr_domain = container_of(fid, struct rxr_domain,
 				  util_domain.domain_fid.fid);
-	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
-				  util_domain.domain_fid);
-
 	ret = fi_close(&rxr_domain->rdm_domain->fid);
 	if (ret)
 		return ret;
@@ -76,12 +72,6 @@ static int rxr_domain_close(fid_t fid)
 	if (ret)
 		return ret;
 
-	if (rxr_env.enable_shm_transfer) {
-		ret = fi_close(&efa_domain->shm_domain->fid);
-		if (ret)
-			return ret;
-	}
-
 	free(rxr_domain);
 	return 0;
 }
@@ -190,6 +180,7 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	if (!rxr_domain)
 		return -FI_ENOMEM;
 
+	rxr_domain->rxr_mr_local = ofi_mr_local(info);
 	rxr_domain->type = EFA_DOMAIN_RDM;
 
 	ret = rxr_get_lower_rdm_info(fabric->api_version, NULL, NULL, 0,
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_ep.c b/deps/libfabric/prov/efa/src/rxr/rxr_ep.c
index b9936a54edd67d5f0acb3966e67d5f712cd587cc..b1cecde6ec2e9b5a11fd6dfa1b928030cf96596e 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_ep.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_ep.c
@@ -59,7 +59,8 @@ struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep,
 	rx_entry->addr = msg->addr;
 	rx_entry->fi_flags = flags;
 	rx_entry->rxr_flags = 0;
-	rx_entry->bytes_done = 0;
+	rx_entry->bytes_received = 0;
+	rx_entry->bytes_copied = 0;
 	rx_entry->window = 0;
 	rx_entry->iov_count = msg->iov_count;
 	rx_entry->tag = tag;
@@ -71,6 +72,8 @@ struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep,
 
 	memset(&rx_entry->cq_entry, 0, sizeof(rx_entry->cq_entry));
 
+	rx_entry->owner = ep->use_zcpy_rx ? RXR_RX_USER_BUF : RXR_RX_PROV_BUF;
+
 	/* Handle case where we're allocating an unexpected rx_entry */
 	if (msg->msg_iov) {
 		memcpy(rx_entry->iov, msg->msg_iov, sizeof(*rx_entry->iov) * msg->iov_count);
@@ -257,10 +260,10 @@ struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep,
 	return rx_entry;
 }
 
-/* Post buf as undirected recv (FI_ADDR_UNSPEC) */
-int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags, enum rxr_lower_ep_type lower_ep_type)
+/* Post buffers as undirected recv (FI_ADDR_UNSPEC) */
+int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_t flags, enum rxr_lower_ep_type lower_ep_type)
 {
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 	struct iovec msg_iov;
 	void *desc;
 	struct rxr_pkt_entry *rx_pkt_entry = NULL;
@@ -271,7 +274,10 @@ int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags, enum rxr_lower_ep_type lo
 		rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_shm_pool);
 		break;
 	case EFA_EP:
-		rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_efa_pool);
+		if (posted_recv)
+			rx_pkt_entry = rxr_pkt_entry_init_prefix(ep, posted_recv, ep->rx_pkt_efa_pool);
+		else
+			rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_efa_pool);
 		break;
 	default:
 		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
@@ -285,16 +291,10 @@ int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags, enum rxr_lower_ep_type lo
 	}
 
 	rx_pkt_entry->x_entry = NULL;
-	rx_pkt_entry->type = RXR_PKT_ENTRY_POSTED;
 
 	msg_iov.iov_base = (void *)rxr_pkt_start(rx_pkt_entry);
 	msg_iov.iov_len = ep->mtu_size;
-
-	msg.msg_iov = &msg_iov;
-	msg.iov_count = 1;
-	msg.addr = FI_ADDR_UNSPEC;
-	msg.context = rx_pkt_entry;
-	msg.data = 0;
+	rxr_setup_msg(&msg, &msg_iov, NULL, 1, FI_ADDR_UNSPEC, rx_pkt_entry, 0);
 
 	switch (lower_ep_type) {
 	case SHM_EP:
@@ -316,13 +316,23 @@ int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags, enum rxr_lower_ep_type lo
 		ep->posted_bufs_shm++;
 		break;
 	case EFA_EP:
-		/* pre-post buffer with efa */
 #if ENABLE_DEBUG
-		dlist_insert_tail(&rx_pkt_entry->dbg_entry,
-				  &ep->rx_posted_buf_list);
+		if (rx_pkt_entry->type != RXR_PKT_ENTRY_USER)
+			dlist_insert_tail(&rx_pkt_entry->dbg_entry,
+					  &ep->rx_posted_buf_list);
 #endif
 		desc = rxr_ep_mr_local(ep) ? fi_mr_desc(rx_pkt_entry->mr) : NULL;
 		msg.desc = &desc;
+		/*
+		 * Use the actual receive sizes from the application
+		 * rather than posting the full MTU size, like we do
+		 * when using the bufpool.
+		 */
+		if (posted_recv) {
+			msg_iov.iov_len = posted_recv->msg_iov->iov_len;
+			msg.data = posted_recv->data;
+			assert(msg_iov.iov_len <= ep->mtu_size);
+		}
 		ret = fi_recvmsg(ep->rdm_ep, &msg, flags);
 		if (OFI_UNLIKELY(ret)) {
 			rxr_pkt_entry_release_rx(ep, rx_pkt_entry);
@@ -372,6 +382,17 @@ void rxr_tx_entry_init(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	else
 		memset(tx_entry->desc, 0, sizeof(tx_entry->desc));
 
+	/*
+	 * The prefix is currently not used by the sender, but needs to be
+	 * accounted for when copying the payload into the bounce-buffer.
+	 */
+	if (ep->use_zcpy_rx) {
+		assert(tx_entry->iov[0].iov_len >= sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr));
+		tx_entry->iov[0].iov_base = (char *)tx_entry->iov[0].iov_base
+					     + sizeof(struct rxr_pkt_entry)
+					     + sizeof(struct rxr_eager_msgrtm_hdr);
+	}
+
 	/* set flags */
 	assert(ep->util_ep.tx_msg_flags == 0 ||
 	       ep->util_ep.tx_msg_flags == FI_COMPLETION);
@@ -546,9 +567,10 @@ int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_
 
 static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 {
+	size_t i = 0;
 	struct rxr_peer *peer;
-	struct dlist_entry *tmp;
 #if ENABLE_DEBUG
+	struct dlist_entry *tmp;
 	struct dlist_entry *entry;
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_tx_entry *tx_entry;
@@ -556,20 +578,18 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 #endif
 
 	if (rxr_need_sas_ordering(rxr_ep)) {
-		dlist_foreach_container_safe(&rxr_ep->peer_list,
-					     struct rxr_peer,
-					     peer, entry, tmp) {
-			ofi_recvwin_free(peer->robuf);
+		for (i = 0; i < rxr_ep->util_ep.av->count; ++i) {
+			peer = rxr_ep_get_peer(rxr_ep, i);
+			if (peer->rx_init)
+				efa_free_robuf(peer);
 		}
-
-		if (rxr_ep->robuf_fs)
-			rxr_robuf_fs_free(rxr_ep->robuf_fs);
+		if (rxr_ep->robuf_pool)
+			ofi_bufpool_destroy(rxr_ep->robuf_pool);
 	}
 
 #if ENABLE_DEBUG
-	dlist_foreach_container_safe(&rxr_ep->peer_list,
-				     struct rxr_peer,
-				     peer, entry, tmp) {
+	for (i = 0; i < rxr_ep->util_ep.av->count; ++i) {
+		peer = rxr_ep_get_peer(rxr_ep, i);
 		/*
 		 * TODO: Add support for wait/signal until all pending messages
 		 * have been sent/received so the core does not attempt to
@@ -608,9 +628,19 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 			rxr_pkt_entry_release_tx(rxr_ep, pkt);
 	}
 
-	dlist_foreach_safe(&rxr_ep->rx_pkt_list, entry, tmp) {
-		pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
-		rxr_pkt_entry_release_rx(rxr_ep, pkt);
+	if (!rxr_ep->use_zcpy_rx) {
+		/*
+		 * The provider does not own these entries, and there's no need
+		 * to deep-free them even in a debug build.
+		 */
+		dlist_foreach_safe(&rxr_ep->rx_pkt_list, entry, tmp) {
+			pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
+			rxr_pkt_entry_release_rx(rxr_ep, pkt);
+		}
+		dlist_foreach_safe(&rxr_ep->rx_posted_buf_list, entry, tmp) {
+			pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
+			ofi_buf_free(pkt);
+		}
 	}
 
 	dlist_foreach_safe(&rxr_ep->tx_pkt_list, entry, tmp) {
@@ -618,10 +648,6 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 		rxr_pkt_entry_release_tx(rxr_ep, pkt);
 	}
 
-	dlist_foreach_safe(&rxr_ep->rx_posted_buf_list, entry, tmp) {
-		pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry);
-		ofi_buf_free(pkt);
-	}
 	dlist_foreach_safe(&rxr_ep->rx_entry_list, entry, tmp) {
 		rx_entry = container_of(entry, struct rxr_rx_entry,
 					rx_entry_entry);
@@ -646,12 +672,24 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep)
 	if (rxr_ep->tx_entry_pool)
 		ofi_bufpool_destroy(rxr_ep->tx_entry_pool);
 
+	if (rxr_ep->map_entry_pool)
+		ofi_bufpool_destroy(rxr_ep->map_entry_pool);
+
 	if (rxr_ep->read_entry_pool)
 		ofi_bufpool_destroy(rxr_ep->read_entry_pool);
 
 	if (rxr_ep->readrsp_tx_entry_pool)
 		ofi_bufpool_destroy(rxr_ep->readrsp_tx_entry_pool);
 
+	if (rxr_ep->rx_readcopy_pkt_pool) {
+		FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "current usage of read copy packet pool is %d\n",
+			rxr_ep->rx_readcopy_pkt_pool_used);
+		FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "maximum usage of read copy packet pool is %d\n",
+			rxr_ep->rx_readcopy_pkt_pool_max_used);
+		assert(!rxr_ep->rx_readcopy_pkt_pool_used);
+		ofi_bufpool_destroy(rxr_ep->rx_readcopy_pkt_pool);
+	}
+
 	if (rxr_ep->rx_ooo_pkt_pool)
 		ofi_bufpool_destroy(rxr_ep->rx_ooo_pkt_pool);
 
@@ -751,10 +789,13 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
 		if (!rxr_ep->peer)
 			return -FI_ENOMEM;
 
-		rxr_ep->robuf_fs = rxr_robuf_fs_create(av->util_av.count,
-						       NULL, NULL);
-		if (!rxr_ep->robuf_fs)
-			return -FI_ENOMEM;
+		if (rxr_need_sas_ordering(rxr_ep)) {
+			ret = ofi_bufpool_create(&rxr_ep->robuf_pool,
+						 sizeof(struct rxr_robuf), 16,
+						 0, 0, 0);
+			if (ret)
+				return ret;
+		}
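
The reorder buffers now come from a growable ofi_bufpool with 16-byte-aligned entries instead of a freestack sized up front to the AV. A hedged sketch of the alloc/free cycle this migration relies on, following the util bufpool API as used in this patch:

	/* ofi_bufpool_create(pool, entry size, alignment, max_cnt, chunk_cnt, flags) */
	struct ofi_bufpool *pool;
	struct rxr_robuf *robuf;

	if (ofi_bufpool_create(&pool, sizeof(struct rxr_robuf), 16, 0, 0, 0))
		return -FI_ENOMEM;

	robuf = ofi_buf_alloc(pool);	/* was: freestack_pop(robuf_fs) */
	/* ... use robuf as the peer's receive window ... */
	ofi_buf_free(robuf);		/* was: freestack_push(robuf_fs, robuf) */
	ofi_bufpool_destroy(pool);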
 
 		/* Bind shm provider endpoint & shm av */
 		if (rxr_ep->use_shm) {
@@ -776,7 +817,7 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
 				 * Copy the entire peer array, because we may not be able to make the
 				 * assumption that insertions are always indexed in order in the future.
 				 */
-				for (i = 0; i <= av->util_av.count; i++) {
+				for (i = 0; i < av->util_av.count; i++) {
 					first_ep_peer = rxr_ep_get_peer(rxr_first_ep, i);
 					if (first_ep_peer->is_local) {
 						peer = rxr_ep_get_peer(rxr_ep, i);
@@ -840,7 +881,14 @@ static int rxr_ep_ctrl(struct fid *fid, int command, void *arg)
 		/* Enable core endpoints & post recv buff */
 		ep = container_of(fid, struct rxr_ep, util_ep.ep_fid.fid);
 
-		rx_size = rxr_get_rx_pool_chunk_cnt(ep);
+		/*
+		 * If the endpoint is configured for zero-copy receives, the
+		 * provider will use the application's undirected receives for
+		 * its internal control packets as well. The onus is on the
+		 * application to keep the receive queue replenished to avoid
+		 * RNRs.
+		 */
+		rx_size = ep->use_zcpy_rx ? rxr_env.zcpy_rx_seed : rxr_get_rx_pool_chunk_cnt(ep);
 		ret = fi_enable(ep->rdm_ep);
 		if (ret)
 			return ret;
@@ -853,7 +901,7 @@ static int rxr_ep_ctrl(struct fid *fid, int command, void *arg)
 			if (i == rx_size - 1)
 				flags = 0;
 
-			ret = rxr_ep_post_buf(ep, flags, EFA_EP);
+			ret = rxr_ep_post_buf(ep, NULL, flags, EFA_EP);
 
 			if (ret)
 				goto out;
@@ -891,7 +939,7 @@ static int rxr_ep_ctrl(struct fid *fid, int command, void *arg)
 				if (i == shm_rx_size - 1)
 					flags = 0;
 
-				ret = rxr_ep_post_buf(ep, flags, SHM_EP);
+				ret = rxr_ep_post_buf(ep, NULL, flags, SHM_EP);
 
 				if (ret)
 					goto out;
@@ -1070,8 +1118,18 @@ static void rxr_buf_region_free_hndlr(struct ofi_bufpool_region *region)
 			fi_strerror(-ret));
 }
 
+/*
+ * rxr_create_pkt_pool creates a packet pool. The pool's size is fixed
+ * and its memory is registered with the device.
+ *
+ * Important arguments:
+ *      size: packet entry size
+ *      flags: the caller can specify OFI_BUFPOOL_HUGEPAGES so the pool
+ *             will be backed by huge pages.
+ */
 static int rxr_create_pkt_pool(struct rxr_ep *ep, size_t size,
 			       size_t chunk_count,
+			       size_t flags,
 			       struct ofi_bufpool **buf_pool)
 {
 	struct ofi_bufpool_attr attr = {
@@ -1085,7 +1143,7 @@ static int rxr_create_pkt_pool(struct rxr_ep *ep, size_t size,
 					rxr_buf_region_free_hndlr : NULL,
 		.init_fn	= NULL,
 		.context	= rxr_ep_domain(ep),
-		.flags		= OFI_BUFPOOL_HUGEPAGES,
+		.flags		= flags,
 	};
 
 	return ofi_bufpool_create_attr(&attr, buf_pool);
@@ -1103,11 +1161,13 @@ int rxr_ep_init(struct rxr_ep *ep)
 #endif
 
 	ret = rxr_create_pkt_pool(ep, entry_sz, rxr_get_tx_pool_chunk_cnt(ep),
+				  OFI_BUFPOOL_HUGEPAGES,
 				  &ep->tx_pkt_efa_pool);
 	if (ret)
 		goto err_out;
 
 	ret = rxr_create_pkt_pool(ep, entry_sz, rxr_get_rx_pool_chunk_cnt(ep),
+				  OFI_BUFPOOL_HUGEPAGES,
 				  &ep->rx_pkt_efa_pool);
 	if (ret)
 		goto err_free_tx_pool;
@@ -1130,12 +1190,36 @@ int rxr_ep_init(struct rxr_ep *ep)
 			goto err_free_rx_unexp_pool;
 	}
 
+	if ((rxr_env.rx_copy_unexp || rxr_env.rx_copy_ooo) &&
+	    (rxr_ep_domain(ep)->util_domain.mr_mode & FI_MR_HMEM)) {
+		/* this pool is only needed when application requested FI_HMEM
+		 * capability
+		 */
+		ret = rxr_create_pkt_pool(ep, entry_sz,
+					  rxr_env.readcopy_pool_size,
+					  0, &ep->rx_readcopy_pkt_pool);
+
+		if (ret)
+			goto err_free_rx_ooo_pool;
+
+		ret = ofi_bufpool_grow(ep->rx_readcopy_pkt_pool);
+		if (ret) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ,
+				"cannot allocate and register memory for readcopy packet pool. error: %s\n",
+				strerror(-ret));
+			goto err_free_rx_readcopy_pool;
+		}
+
+		ep->rx_readcopy_pkt_pool_used = 0;
+		ep->rx_readcopy_pkt_pool_max_used = 0;
+	}
+
 	ret = ofi_bufpool_create(&ep->tx_entry_pool,
 				 sizeof(struct rxr_tx_entry),
 				 RXR_BUF_POOL_ALIGNMENT,
 				 ep->tx_size, ep->tx_size, 0);
 	if (ret)
-		goto err_free_rx_ooo_pool;
+		goto err_free_rx_readcopy_pool;
 
 	ret = ofi_bufpool_create(&ep->read_entry_pool,
 				 sizeof(struct rxr_read_entry),
@@ -1202,7 +1286,6 @@ int rxr_ep_init(struct rxr_ep *ep)
 	dlist_init(&ep->tx_pending_list);
 	dlist_init(&ep->read_pending_list);
 	dlist_init(&ep->peer_backoff_list);
-	dlist_init(&ep->peer_list);
 #if ENABLE_DEBUG
 	dlist_init(&ep->rx_pending_list);
 	dlist_init(&ep->rx_pkt_list);
@@ -1232,6 +1315,9 @@ err_free_read_entry_pool:
 err_free_tx_entry_pool:
 	if (ep->tx_entry_pool)
 		ofi_bufpool_destroy(ep->tx_entry_pool);
+err_free_rx_readcopy_pool:
+	if (ep->rx_readcopy_pkt_pool)
+		ofi_bufpool_destroy(ep->rx_readcopy_pkt_pool);
 err_free_rx_ooo_pool:
 	if (rxr_env.rx_copy_ooo && ep->rx_ooo_pkt_pool)
 		ofi_bufpool_destroy(ep->rx_ooo_pkt_pool);
@@ -1285,7 +1371,7 @@ static inline int rxr_ep_bulk_post_recv(struct rxr_ep *ep)
 	while (ep->rx_bufs_efa_to_post) {
 		if (ep->rx_bufs_efa_to_post == 1)
 			flags = 0;
-		ret = rxr_ep_post_buf(ep, flags, EFA_EP);
+		ret = rxr_ep_post_buf(ep, NULL, flags, EFA_EP);
 		if (OFI_LIKELY(!ret))
 			ep->rx_bufs_efa_to_post--;
 		else
@@ -1296,7 +1382,7 @@ static inline int rxr_ep_bulk_post_recv(struct rxr_ep *ep)
 	while (ep->use_shm && ep->rx_bufs_shm_to_post) {
 		if (ep->rx_bufs_shm_to_post == 1)
 			flags = 0;
-		ret = rxr_ep_post_buf(ep, flags, SHM_EP);
+		ret = rxr_ep_post_buf(ep, NULL, flags, SHM_EP);
 		if (OFI_LIKELY(!ret))
 			ep->rx_bufs_shm_to_post--;
 		else
@@ -1385,7 +1471,8 @@ static inline void rxr_ep_poll_cq(struct rxr_ep *ep,
 			if (rxr_cq_handle_cq_error(ep, ret))
 				assert(0 &&
 				       "error writing error cq entry after reading from cq");
-			rxr_ep_bulk_post_recv(ep);
+			if (!ep->use_zcpy_rx)
+				rxr_ep_bulk_post_recv(ep);
 			return;
 		}
 
@@ -1428,7 +1515,8 @@ void rxr_ep_progress_internal(struct rxr_ep *ep)
 	struct dlist_entry *tmp;
 	ssize_t ret;
 
-	rxr_ep_check_available_data_bufs_timer(ep);
+	if (!ep->use_zcpy_rx)
+		rxr_ep_check_available_data_bufs_timer(ep);
 
 	// Poll the EFA completion queue
 	rxr_ep_poll_cq(ep, ep->rdm_cq, rxr_env.efa_cq_read_size, 0);
@@ -1437,13 +1525,15 @@ void rxr_ep_progress_internal(struct rxr_ep *ep)
 	if (ep->use_shm)
 		rxr_ep_poll_cq(ep, ep->shm_cq, rxr_env.shm_cq_read_size, 1);
 
-	ret = rxr_ep_bulk_post_recv(ep);
+	if (!ep->use_zcpy_rx) {
+		ret = rxr_ep_bulk_post_recv(ep);
 
-	if (OFI_UNLIKELY(ret)) {
-		if (rxr_cq_handle_cq_error(ep, ret))
-			assert(0 &&
-			       "error writing error cq entry after failed post recv");
-		return;
+		if (OFI_UNLIKELY(ret)) {
+			if (rxr_cq_handle_cq_error(ep, ret))
+				assert(0 &&
+				       "error writing error cq entry after failed post recv");
+			return;
+		}
 	}
 
 	rxr_ep_check_peer_backoff_timer(ep);
@@ -1669,7 +1759,11 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 	rxr_ep->msg_order = info->rx_attr->msg_order;
 	rxr_ep->core_msg_order = rdm_info->rx_attr->msg_order;
 	rxr_ep->core_inject_size = rdm_info->tx_attr->inject_size;
+	rxr_ep->max_msg_size = info->ep_attr->max_msg_size;
+	rxr_ep->max_proto_hdr_size = rxr_pkt_max_header_size();
 	rxr_ep->mtu_size = rdm_info->ep_attr->max_msg_size;
+	fi_freeinfo(rdm_info);
+
 	if (rxr_env.mtu_size > 0 && rxr_env.mtu_size < rxr_ep->mtu_size)
 		rxr_ep->mtu_size = rxr_env.mtu_size;
 
@@ -1677,17 +1771,16 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info,
 		rxr_ep->mtu_size = RXR_MTU_MAX_LIMIT;
 
 	rxr_ep->max_data_payload_size = rxr_ep->mtu_size - sizeof(struct rxr_data_hdr);
-	/*
-	 * Assume our eager message size is the largest control header size
-	 * without the source address. Use that value to set the default
-	 * receive release threshold.
-	 */
-	rxr_ep->min_multi_recv_size = rxr_ep->mtu_size - sizeof(struct rxr_eager_tagrtm_hdr) - sizeof(struct rxr_req_opt_cq_data_hdr);
+	rxr_ep->min_multi_recv_size = rxr_ep->mtu_size - rxr_ep->max_proto_hdr_size;
 
 	if (rxr_env.tx_queue_size > 0 &&
 	    rxr_env.tx_queue_size < rxr_ep->max_outstanding_tx)
 		rxr_ep->max_outstanding_tx = rxr_env.tx_queue_size;
 
+	rxr_ep->use_zcpy_rx = rxr_ep_use_zcpy_rx(rxr_ep, info);
+	FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "rxr_ep->use_zcpy_rx = %d\n", rxr_ep->use_zcpy_rx);
+
 #if ENABLE_DEBUG
 	rxr_ep->sends = 0;
 	rxr_ep->send_comps = 0;
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_init.c b/deps/libfabric/prov/efa/src/rxr/rxr_init.c
index 0c54c8bb088d8a61e968ca496596c100b3d9dd77..a2230545f0e0ea66d053b259466f56fd14020302 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_init.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_init.c
@@ -36,6 +36,7 @@
 #include <ofi_prov.h>
 #include "rxr.h"
 #include "efa.h"
+#include "ofi_hmem.h"
 
 struct fi_info *shm_info;
 
@@ -50,9 +51,12 @@ struct rxr_env rxr_env = {
 	.tx_queue_size = 0,
 	.enable_shm_transfer = 1,
 	.use_device_rdma = 0,
+	.use_zcpy_rx = 1,
+	.zcpy_rx_seed = 0,
 	.shm_av_size = 128,
 	.shm_max_medium_size = 4096,
 	.recvwin_size = RXR_RECVWIN_SIZE,
+	.readcopy_pool_size = 256,
 	.cq_size = RXR_DEF_CQ_SIZE,
 	.max_memcpy_size = 4096,
 	.mtu_size = 0,
@@ -67,6 +71,7 @@ struct rxr_env rxr_env = {
 	.efa_cq_read_size = 50,
 	.shm_cq_read_size = 50,
 	.efa_max_medium_msg_size = 65536,
+	.efa_min_read_msg_size = 1048576,
 	.efa_min_read_write_size = 65536,
 	.efa_read_segment_size = 1073741824,
 };
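
Each new field above is wired to a runtime parameter by the fi_param_get_* calls in rxr_init_env() below; with the provider registered as "efa", they surface as FI_EFA_* environment variables. A sketch of overriding them before fi_getinfo(); the exact variable spellings are inferred from the fi_param_define() strings and should be treated as assumptions:

	#include <stdlib.h>

	setenv("FI_EFA_USE_ZCPY_RX", "0", 1);          /* opt out of zero-copy rx */
	setenv("FI_EFA_ZCPY_RX_SEED", "64", 1);        /* prepost 64 internal buffers */
	setenv("FI_EFA_READCOPY_POOL_SIZE", "512", 1); /* larger staging pool */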
@@ -79,9 +84,12 @@ static void rxr_init_env(void)
 	fi_param_get_int(&rxr_prov, "tx_queue_size", &rxr_env.tx_queue_size);
 	fi_param_get_int(&rxr_prov, "enable_shm_transfer", &rxr_env.enable_shm_transfer);
 	fi_param_get_int(&rxr_prov, "use_device_rdma", &rxr_env.use_device_rdma);
+	fi_param_get_int(&rxr_prov, "use_zcpy_rx", &rxr_env.use_zcpy_rx);
+	fi_param_get_int(&rxr_prov, "zcpy_rx_seed", &rxr_env.zcpy_rx_seed);
 	fi_param_get_int(&rxr_prov, "shm_av_size", &rxr_env.shm_av_size);
 	fi_param_get_int(&rxr_prov, "shm_max_medium_size", &rxr_env.shm_max_medium_size);
 	fi_param_get_int(&rxr_prov, "recvwin_size", &rxr_env.recvwin_size);
+	fi_param_get_int(&rxr_prov, "readcopy_pool_size", &rxr_env.readcopy_pool_size);
 	fi_param_get_int(&rxr_prov, "cq_size", &rxr_env.cq_size);
 	fi_param_get_size_t(&rxr_prov, "max_memcpy_size",
 			    &rxr_env.max_memcpy_size);
@@ -110,6 +118,8 @@ static void rxr_init_env(void)
 			 &rxr_env.shm_cq_read_size);
 	fi_param_get_size_t(&rxr_prov, "inter_max_medium_message_size",
 			    &rxr_env.efa_max_medium_msg_size);
+	fi_param_get_size_t(&rxr_prov, "inter_min_read_message_size",
+			    &rxr_env.efa_min_read_msg_size);
 	fi_param_get_size_t(&rxr_prov, "inter_min_read_write_size",
 			    &rxr_env.efa_min_read_write_size);
 	fi_param_get_size_t(&rxr_prov, "inter_read_segment_size",
@@ -169,7 +179,7 @@ void rxr_info_to_core_mr_modes(uint32_t version,
 					hints->domain_attr->mr_mode & OFI_MR_BASIC_MAP;
 			core_info->addr_format = hints->addr_format;
 		}
-#ifdef HAVE_LIBCUDA
+#if HAVE_LIBCUDA
 		core_info->domain_attr->mr_mode |= FI_MR_HMEM;
 #endif
 	}
@@ -212,6 +222,9 @@ static int rxr_copy_attr(const struct fi_info *info, struct fi_info *dup)
 		if (!dup->nic)
 			return -FI_ENOMEM;
 	}
+	if (info->caps & FI_HMEM)
+		dup->caps |= FI_HMEM;
+
 	return 0;
 }
 
@@ -276,6 +289,10 @@ void rxr_reset_rx_tx_to_core(const struct fi_info *user_info,
 		user_info->tx_attr->size : core_info->tx_attr->size;
 }
 
+/*
+ * Used to set tx/rx attributes that are characteristic of the device for the
+ * two endpoint types and not emulated in software.
+ */
 void rxr_set_rx_tx_size(struct fi_info *info,
 			const struct fi_info *core_info)
 {
@@ -326,6 +343,16 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 	info->domain_attr->cq_cnt = core_info->domain_attr->cq_cnt;
 	info->domain_attr->mr_key_size = core_info->domain_attr->mr_key_size;
 
+	/*
+	 * Do not advertise FI_HMEM capabilities when the core cannot support
+	 * it or when the application passes NULL hints (given this is a primary
+	 * cap). The logic for device-specific checks pertaining to HMEM comes
+	 * further along this path.
+	 */
+	if ((core_info && !(core_info->caps & FI_HMEM)) || !hints) {
+		info->caps &= ~FI_HMEM;
+	}
+
 	/*
 	 * Handle user-provided hints and adapt the info object passed back up
 	 * based on EFA-specific constraints.
@@ -368,7 +395,7 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 		if (!hints->domain_attr || hints->domain_attr->av_type == FI_AV_UNSPEC)
 			info->domain_attr->av_type = FI_AV_TABLE;
 
-#ifdef HAVE_LIBCUDA
+#if HAVE_LIBCUDA
 		/* If the application requires HMEM support, we will add FI_MR_HMEM
 		 * to mr_mode, because we need application to provide descriptor
 		 * for cuda buffer.
@@ -381,6 +408,20 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 		 * which means FI_MR_HMEM implies FI_MR_LOCAL for cuda buffer
 		 */
 		if (hints->caps & FI_HMEM) {
+
+			if (!efa_device_support_rdma_read()) {
+				FI_INFO(&rxr_prov, FI_LOG_CORE,
+				        "FI_HMEM capability requires RDMA, which this device does not support.\n");
+				return -FI_ENODATA;
+			}
+
+			if (!rxr_env.use_device_rdma) {
+				FI_INFO(&rxr_prov, FI_LOG_CORE,
+				        "FI_HMEM capability requires RDMA, which is turned off. You can turn it on by set environment variable FI_EFA_USE_DEVICE_RDMA to 1.\n");
+				return -FI_ENODATA;
+			}
+
 			if (hints->domain_attr &&
 			    !(hints->domain_attr->mr_mode & FI_MR_HMEM)) {
 				FI_INFO(&rxr_prov, FI_LOG_CORE,
@@ -390,19 +431,49 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info,
 
 			info->domain_attr->mr_mode |= FI_MR_HMEM;
 
+		} else {
 			/*
-			 * If in this case application add FI_MR_LOCAL to hints,
-			 * it would mean that application want provide descriptor
-			 * for system memory too, which we are able to use, so
-			 * we add FI_MR_LOCAL to mr_mode.
-			 *
-			 * TODO: add FI_MR_LOCAL to mr_mode for any applcations
-			 * the requested it, not just CUDA application.
+			 * FI_HMEM is a primary capability. Providers should
+			 * only enable it if requested by applications.
 			 */
-			if (hints->domain_attr->mr_mode & FI_MR_LOCAL)
-				info->domain_attr->mr_mode |= FI_MR_LOCAL;
+			info->caps &= ~FI_HMEM;
 		}
 #endif
+		/*
+		 * The provider does not force applications to register buffers
+		 * with the device, but if an application can, we reuse its
+		 * registrations and avoid the bounce buffers.
+		 */
+		if (hints->domain_attr && hints->domain_attr->mr_mode & FI_MR_LOCAL)
+			info->domain_attr->mr_mode |= FI_MR_LOCAL;
+
+		/*
+		 * The same goes for prefix mode: the protocol does not
+		 * strictly need a prefix before receive buffers, but it can
+		 * use one, when available, to optimize transfers with
+		 * endpoints that have the following profile:
+		 *	- Requires FI_MSG and not FI_TAGGED/FI_ATOMIC/FI_RMA
+		 *	- Can handle registrations (FI_MR_LOCAL)
+		 *	- No need for FI_DIRECTED_RECV
+		 *	- Guaranteed to send msgs smaller than info->nic->link_attr->mtu
+		 */
+		if (hints->mode & FI_MSG_PREFIX) {
+			FI_INFO(&rxr_prov, FI_LOG_CORE,
+				"FI_MSG_PREFIX supported by application.\n");
+			info->mode |= FI_MSG_PREFIX;
+			info->tx_attr->mode |= FI_MSG_PREFIX;
+			info->rx_attr->mode |= FI_MSG_PREFIX;
+
+			/*
+			 * The prefix needs to be a multiple of 8. The pkt_entry
+			 * is already at 64 bytes (128 with debug).
+			 */
+			info->ep_attr->msg_prefix_size =  sizeof(struct rxr_pkt_entry)
+							  + sizeof(struct rxr_eager_msgrtm_hdr);
+			assert(!(info->ep_attr->msg_prefix_size % 8));
+			FI_INFO(&rxr_prov, FI_LOG_CORE,
+				"FI_MSG_PREFIX size = %ld\n", info->ep_attr->msg_prefix_size);
+		}
 	}
 
 	rxr_set_rx_tx_size(info, core_info);
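
Once FI_MSG_PREFIX is negotiated, every receive the application posts must reserve msg_prefix_size bytes at the front of the buffer, and the payload is delivered after that prefix. A hedged application-side sketch; MAX_PAYLOAD is hypothetical and error handling is elided:

	size_t prefix = info->ep_attr->msg_prefix_size;
	char *buf = malloc(prefix + MAX_PAYLOAD);
	struct iovec iov = {
		.iov_base = buf,                  /* provider-owned prefix at the front */
		.iov_len  = prefix + MAX_PAYLOAD,
	};

	/* Pass a descriptor instead of NULL if FI_MR_LOCAL was negotiated. */
	fi_recvv(ep, &iov, NULL, 1, FI_ADDR_UNSPEC, NULL);
	/* On completion the message payload starts at buf + prefix. */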
@@ -590,7 +661,7 @@ dgram_info:
 		ret = 0;
 
 	if (!ret && rxr_env.enable_shm_transfer && !shm_info) {
-		shm_info = fi_allocinfo();
+		shm_info = NULL;
 		shm_hints = fi_allocinfo();
 		rxr_set_shm_hints(shm_hints);
 		ret = fi_getinfo(FI_VERSION(1, 8), NULL, NULL,
@@ -635,15 +706,16 @@ static void rxr_fini(void)
 	}
 
 #if HAVE_EFA_DL
-	ofi_monitor_cleanup();
+	ofi_monitors_cleanup();
+	ofi_hmem_cleanup();
 	ofi_mem_fini();
 #endif
 }
 
 struct fi_provider rxr_prov = {
 	.name = "efa",
-	.version = FI_VERSION(RXR_MAJOR_VERSION, RXR_MINOR_VERSION),
-	.fi_version = RXR_FI_VERSION,
+	.version = OFI_VERSION_DEF_PROV,
+	.fi_version = OFI_VERSION_LATEST,
 	.getinfo = rxr_getinfo,
 	.fabric = rxr_fabric,
 	.cleanup = rxr_fini
@@ -663,12 +735,18 @@ EFA_INI
 			"Enable using SHM provider to provide the communication between processes on the same system. (Default: 1)");
 	fi_param_define(&rxr_prov, "use_device_rdma", FI_PARAM_INT,
 			"whether to use device's RDMA functionality for one-sided and two-sided transfer.");
+	fi_param_define(&rxr_prov, "use_zcpy_rx", FI_PARAM_INT,
+			"Enables the use of application's receive buffers in place of bounce-buffers when feasible. (Default: 1)");
+	fi_param_define(&rxr_prov, "zcpy_rx_seed", FI_PARAM_INT,
+			"Defines the number of bounce-buffers the provider will prepost during EP initialization.  (Default: 0)");
 	fi_param_define(&rxr_prov, "shm_av_size", FI_PARAM_INT,
 			"Defines the maximum number of entries in SHM provider's address vector (Default 128).");
 	fi_param_define(&rxr_prov, "shm_max_medium_size", FI_PARAM_INT,
 			"Defines the switch point between small/medium message and large message. The message larger than this switch point will be transferred with large message protocol (Default 4096).");
 	fi_param_define(&rxr_prov, "recvwin_size", FI_PARAM_INT,
 			"Defines the size of sliding receive window. (Default: 16384)");
+	fi_param_define(&rxr_prov, "readcopy_pool_size", FI_PARAM_INT,
+			"Defines the size of readcopy packet pool size. (Default: 256)");
 	fi_param_define(&rxr_prov, "cq_size", FI_PARAM_INT,
 			"Define the size of completion queue. (Default: 8192)");
 	fi_param_define(&rxr_prov, "mr_cache_enable", FI_PARAM_BOOL,
@@ -702,7 +780,10 @@ EFA_INI
 	fi_param_define(&rxr_prov, "shm_cq_read_size", FI_PARAM_SIZE_T,
 			"Set the number of SHM completion entries to read for one loop for one iteration of the progress engine. (Default: 50)");
 	fi_param_define(&rxr_prov, "inter_max_medium_message_size", FI_PARAM_INT,
-			"The maximum message size for inter EFA medium message protocol, messages whose size is larger than this value will be sent either by read message protocol (depend on firmware support), or long message protocol (Default 65536).");
+			"The maximum message size for inter EFA medium message protocol (Default 65536).");
+	fi_param_define(&rxr_prov, "inter_min_read_message_size", FI_PARAM_INT,
+			"The minimum message size for inter EFA read message protocol. If instance support RDMA read, messages whose size is larger than this value will be sent by read message protocol (Default 1048576).");
+
 	fi_param_define(&rxr_prov, "inter_min_read_write_size", FI_PARAM_INT,
 			"The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536).");
 	fi_param_define(&rxr_prov, "inter_read_segment_size", FI_PARAM_INT,
@@ -711,7 +792,8 @@ EFA_INI
 
 #if HAVE_EFA_DL
 	ofi_mem_init();
-	ofi_monitor_init();
+	ofi_hmem_init();
+	ofi_monitors_init();
 #endif
 
 	lower_efa_prov = init_lower_efa_prov();
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_msg.c b/deps/libfabric/prov/efa/src/rxr/rxr_msg.c
index 6fd08fe08b5b39afd65e95a81c87f533fe8d2018..adcc044063b51e9207bca733dec403f7f8bd74ef 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_msg.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_msg.c
@@ -58,6 +58,46 @@
 /**
  *   Utility functions used by both non-tagged and tagged send.
  */
+static inline
+ssize_t rxr_msg_post_cuda_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
+{
+	int err, tagged;
+	struct rxr_peer *peer;
+
+	assert(RXR_EAGER_MSGRTM_PKT + 1 == RXR_EAGER_TAGRTM_PKT);
+	assert(RXR_READ_MSGRTM_PKT + 1 == RXR_READ_TAGRTM_PKT);
+
+	tagged = (tx_entry->op == ofi_op_tagged);
+	assert(tagged == 0 || tagged == 1);
+
+	if (tx_entry->total_len == 0)
+		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
+							  RXR_EAGER_MSGRTM_PKT + tagged, 0);
+
+	/* Currently, CUDA data must be sent using the read message protocol.
+	 * However, because the read message protocol is an extra feature, we
+	 * cannot be sure the receiver supports it.
+	 * The only way to be sure is through a handshake packet from the
+	 * receiver, so we call rxr_pkt_wait_handshake() here.
+	 */
+	peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr);
+	assert(peer);
+	err = rxr_pkt_wait_handshake(rxr_ep, tx_entry->addr, peer);
+	if (OFI_UNLIKELY(err)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "waiting for handshake packet failed!\n");
+		return err;
+	}
+
+	assert(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED);
+	if (!efa_peer_support_rdma_read(peer)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Cannot send gpu data because receiver does not support RDMA\n");
+		return -FI_EOPNOTSUPP;
+	}
+
+	return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
+					  RXR_READ_MSGRTM_PKT + tagged, 0);
+}
+
 ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 {
 	/*
@@ -73,6 +113,12 @@ ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 	size_t max_rtm_data_size;
 	ssize_t err;
 	struct rxr_peer *peer;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(rxr_ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	assert(tx_entry->op == ofi_op_msg || tx_entry->op == ofi_op_tagged);
 	tagged = (tx_entry->op == ofi_op_tagged);
@@ -93,6 +139,24 @@ ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry, rtm_type + tagged, 0);
 	}
 
+	if (rxr_ep->use_zcpy_rx) {
+		/*
+		 * The application cannot deal with varying packet header sizes
+		 * before and after receiving a handshake. Force a handshake
+		 * here so we can always use the smallest eager msg packet
+		 * header size to determine msg_prefix_size.
+		 */
+		err = rxr_pkt_wait_handshake(rxr_ep, tx_entry->addr, peer);
+		if (OFI_UNLIKELY(err))
+			return err;
+
+		assert(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED);
+	}
+
+	if (efa_ep_is_cuda_mr(tx_entry->desc[0])) {
+		return rxr_msg_post_cuda_rtm(rxr_ep, tx_entry);
+	}
+
 	/* inter instance message */
 	if (tx_entry->total_len <= max_rtm_data_size)
 		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
@@ -102,15 +166,15 @@ ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry)
 		/* we do not check the return value of rxr_ep_init_mr_desc()
 		 * because medium message works even if MR registration failed
 		 */
-		if (efa_mr_cache_enable)
-			rxr_ep_tx_init_mr_desc(rxr_ep_domain(rxr_ep),
-					       tx_entry, 0, FI_SEND);
+		if (efa_is_cache_available(efa_domain))
+			rxr_ep_tx_init_mr_desc(rxr_domain, tx_entry, 0, FI_SEND);
 		return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
 						  RXR_MEDIUM_MSGRTM_PKT + tagged, 0);
 	}
 
-	if (efa_both_support_rdma_read(rxr_ep, peer) &&
-	    (tx_entry->desc[0] || efa_mr_cache_enable)) {
+	if (tx_entry->total_len >= rxr_env.efa_min_read_msg_size &&
+	    efa_both_support_rdma_read(rxr_ep, peer) &&
+	    (tx_entry->desc[0] || efa_is_cache_available(efa_domain))) {
 		/* use read message protocol */
 		err = rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry,
 						 RXR_READ_MSGRTM_PKT + tagged, 0);
@@ -197,15 +261,9 @@ ssize_t rxr_msg_sendv(struct fid_ep *ep, const struct iovec *iov,
 		      void *context)
 {
 	struct rxr_ep *rxr_ep;
-	struct fi_msg msg;
-
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = iov;
-	msg.desc = desc;
-	msg.iov_count = count;
-	msg.addr = dest_addr;
-	msg.context = context;
+	struct fi_msg msg = {0};
 
+	rxr_setup_msg(&msg, iov, desc, count, dest_addr, context, 0);
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	return rxr_msg_sendmsg(ep, &msg, rxr_tx_flags(rxr_ep));
 }
@@ -218,7 +276,7 @@ ssize_t rxr_msg_send(struct fid_ep *ep, const void *buf, size_t len,
 
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
-	return rxr_msg_sendv(ep, &iov, desc, 1, dest_addr, context);
+	return rxr_msg_sendv(ep, &iov, &desc, 1, dest_addr, context);
 }
 
 static
@@ -226,21 +284,14 @@ ssize_t rxr_msg_senddata(struct fid_ep *ep, const void *buf, size_t len,
 			 void *desc, uint64_t data, fi_addr_t dest_addr,
 			 void *context)
 {
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 	struct iovec iov;
 	struct rxr_ep *rxr_ep;
 
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.desc = desc;
-	msg.iov_count = 1;
-	msg.addr = dest_addr;
-	msg.context = context;
-	msg.data = data;
-
+	rxr_setup_msg(&msg, &iov, &desc, 1, dest_addr, context, data);
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	return rxr_msg_generic_send(ep, &msg, 0, ofi_op_msg,
 				    rxr_tx_flags(rxr_ep) | FI_REMOTE_CQ_DATA);
@@ -251,17 +302,13 @@ ssize_t rxr_msg_inject(struct fid_ep *ep, const void *buf, size_t len,
 		       fi_addr_t dest_addr)
 {
 	struct rxr_ep *rxr_ep;
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 	struct iovec iov;
 
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.iov_count = 1;
-	msg.addr = dest_addr;
-
+	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, 0);
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_msgrtm_hdr));
 
@@ -281,12 +328,7 @@ ssize_t rxr_msg_injectdata(struct fid_ep *ep, const void *buf,
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.iov_count = 1;
-	msg.addr = dest_addr;
-	msg.data = data;
-
+	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, data);
 	rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid);
 	/*
 	 * We advertise the largest possible inject size with no cq data or
@@ -306,14 +348,9 @@ static
 ssize_t rxr_msg_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *tmsg,
 			 uint64_t flags)
 {
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 
-	msg.msg_iov = tmsg->msg_iov;
-	msg.desc = tmsg->desc;
-	msg.iov_count = tmsg->iov_count;
-	msg.addr = tmsg->addr;
-	msg.context = tmsg->context;
-	msg.data = tmsg->data;
+	rxr_setup_msg(&msg, tmsg->msg_iov, tmsg->desc, tmsg->iov_count, tmsg->addr, tmsg->context, tmsg->data);
 	return rxr_msg_generic_send(ep_fid, &msg, tmsg->tag, ofi_op_tagged, flags);
 }
 
@@ -323,9 +360,8 @@ ssize_t rxr_msg_tsendv(struct fid_ep *ep_fid, const struct iovec *iov,
 		       uint64_t tag, void *context)
 {
 	struct rxr_ep *rxr_ep;
-	struct fi_msg_tagged msg;
+	struct fi_msg_tagged msg = {0};
 
-	memset(&msg, 0, sizeof(msg));
 	msg.msg_iov = iov;
 	msg.desc = desc;
 	msg.iov_count = count;
@@ -355,20 +391,14 @@ ssize_t rxr_msg_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 			  void *desc, uint64_t data, fi_addr_t dest_addr,
 			  uint64_t tag, void *context)
 {
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 	struct iovec iov;
 	struct rxr_ep *rxr_ep;
 
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
 
-	msg.msg_iov = &iov;
-	msg.desc = desc;
-	msg.iov_count = 1;
-	msg.addr = dest_addr;
-	msg.context = context;
-	msg.data = data;
-
+	rxr_setup_msg(&msg, &iov, &desc, 1, dest_addr, context, data);
 	rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid);
 	return rxr_msg_generic_send(ep_fid, &msg, tag, ofi_op_tagged,
 				    rxr_tx_flags(rxr_ep) | FI_REMOTE_CQ_DATA);
@@ -379,17 +409,13 @@ ssize_t rxr_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len,
 			fi_addr_t dest_addr, uint64_t tag)
 {
 	struct rxr_ep *rxr_ep;
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 	struct iovec iov;
 
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.iov_count = 1;
-	msg.addr = dest_addr;
-
+	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, 0);
 	rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid);
 	assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_tagrtm_hdr));
 
@@ -402,18 +428,13 @@ ssize_t rxr_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
 			    uint64_t data, fi_addr_t dest_addr, uint64_t tag)
 {
 	struct rxr_ep *rxr_ep;
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 	struct iovec iov;
 
 	iov.iov_base = (void *)buf;
 	iov.iov_len = len;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = &iov;
-	msg.iov_count = 1;
-	msg.addr = dest_addr;
-	msg.data = data;
-
+	rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, data);
 	rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid);
 	/*
 	 * We advertise the largest possible inject size with no cq data or
@@ -768,7 +789,12 @@ ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg,
 	unexp_list = (op == ofi_op_tagged) ? &rxr_ep->rx_unexp_tagged_list :
 		     &rxr_ep->rx_unexp_list;
 
-	if (!dlist_empty(unexp_list)) {
+	/*
+	 * Attempt to match against stashed unexpected messages. This does not
+	 * apply to the zero-copy path, where unexpected messages cannot occur
+	 * since there is no tag or address to match against.
+	 */
+	if (!dlist_empty(unexp_list) && !rxr_ep->use_zcpy_rx) {
 		ret = rxr_msg_proc_unexp_msg_list(rxr_ep, msg, tag,
 						  ignore, op, flags, NULL);
 
@@ -791,6 +817,9 @@ ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg,
 	else
 		dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_list);
 
+	if (rxr_ep->use_zcpy_rx)
+		rxr_ep_post_buf(rxr_ep, msg, flags, EFA_EP);
+
 out:
 	fastlock_release(&rxr_ep->util_ep.lock);
 
@@ -960,20 +989,13 @@ static
 ssize_t rxr_msg_recv(struct fid_ep *ep, void *buf, size_t len,
 		     void *desc, fi_addr_t src_addr, void *context)
 {
-	struct fi_msg msg;
-	struct iovec msg_iov;
-
-	memset(&msg, 0, sizeof(msg));
-	msg_iov.iov_base = buf;
-	msg_iov.iov_len = len;
+	struct fi_msg msg = {0};
+	struct iovec iov;
 
-	msg.msg_iov = &msg_iov;
-	msg.desc = &desc;
-	msg.iov_count = 1;
-	msg.addr = src_addr;
-	msg.context = context;
-	msg.data = 0;
+	iov.iov_base = buf;
+	iov.iov_len = len;
 
+	rxr_setup_msg(&msg, &iov, &desc, 1, src_addr, context, 0);
 	return rxr_msg_recvmsg(ep, &msg, 0);
 }
 
@@ -982,16 +1004,9 @@ ssize_t rxr_msg_recvv(struct fid_ep *ep, const struct iovec *iov,
 		      void **desc, size_t count, fi_addr_t src_addr,
 		      void *context)
 {
-	struct fi_msg msg;
-
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = iov;
-	msg.desc = desc;
-	msg.iov_count = count;
-	msg.addr = src_addr;
-	msg.context = context;
-	msg.data = 0;
+	struct fi_msg msg = {0};
 
+	rxr_setup_msg(&msg, iov, desc, count, src_addr, context, 0);
 	return rxr_msg_recvmsg(ep, &msg, 0);
 }
 
@@ -1003,18 +1018,13 @@ ssize_t rxr_msg_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
 		      fi_addr_t src_addr, uint64_t tag, uint64_t ignore,
 		      void *context)
 {
-	struct fi_msg msg;
-	struct iovec msg_iov;
-
-	msg_iov.iov_base = (void *)buf;
-	msg_iov.iov_len = len;
+	struct fi_msg msg = {0};
+	struct iovec iov;
 
-	msg.msg_iov = &msg_iov;
-	msg.iov_count = 1;
-	msg.addr = src_addr;
-	msg.context = context;
-	msg.desc = &desc;
+	iov.iov_base = (void *)buf;
+	iov.iov_len = len;
 
+	rxr_setup_msg(&msg, &iov, &desc, 1, src_addr, context, 0);
 	return rxr_msg_generic_recv(ep_fid, &msg, tag, ignore, ofi_op_tagged, 0);
 }
 
@@ -1023,39 +1033,29 @@ ssize_t rxr_msg_trecvv(struct fid_ep *ep_fid, const struct iovec *iov,
 		       void **desc, size_t count, fi_addr_t src_addr,
 		       uint64_t tag, uint64_t ignore, void *context)
 {
-	struct fi_msg msg;
-
-	msg.msg_iov = iov;
-	msg.iov_count = count;
-	msg.addr = src_addr;
-	msg.desc = desc;
-	msg.context = context;
+	struct fi_msg msg = {0};
 
+	rxr_setup_msg(&msg, iov, desc, count, src_addr, context, 0);
 	return rxr_msg_generic_recv(ep_fid, &msg, tag, ignore, ofi_op_tagged, 0);
 }
 
 static
-ssize_t rxr_msg_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *tagmsg,
+ssize_t rxr_msg_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *tmsg,
 			 uint64_t flags)
 {
 	ssize_t ret;
-	struct fi_msg msg;
+	struct fi_msg msg = {0};
 
 	if (flags & FI_PEEK) {
-		ret = rxr_msg_peek_trecv(ep_fid, tagmsg, flags);
+		ret = rxr_msg_peek_trecv(ep_fid, tmsg, flags);
 		goto out;
 	} else if (flags & FI_CLAIM) {
-		ret = rxr_msg_claim_trecv(ep_fid, tagmsg, flags);
+		ret = rxr_msg_claim_trecv(ep_fid, tmsg, flags);
 		goto out;
 	}
 
-	msg.msg_iov = tagmsg->msg_iov;
-	msg.iov_count = tagmsg->iov_count;
-	msg.addr = tagmsg->addr;
-	msg.desc = tagmsg->desc;
-	msg.context = tagmsg->context;
-
-	ret = rxr_msg_generic_recv(ep_fid, &msg, tagmsg->tag, tagmsg->ignore,
+	rxr_setup_msg(&msg, tmsg->msg_iov, tmsg->desc, tmsg->iov_count, tmsg->addr, tmsg->context, tmsg->data);
+	ret = rxr_msg_generic_recv(ep_fid, &msg, tmsg->tag, tmsg->ignore,
 				   ofi_op_tagged, flags);
 
 out:
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c
index 1dfa4c3e33ce17418464548a38697c10b770e304..a6398574461739104c715228f7ef7e1e74116423 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.c
@@ -33,8 +33,13 @@
 
 #include "efa.h"
 #include "rxr.h"
+#include "rxr_msg.h"
 #include "rxr_cntr.h"
-#include "efa_cuda.h"
+#include "rxr_read.h"
+#include "rxr_pkt_cmd.h"
+
+/* Handshake wait timeout in microseconds */
+#define RXR_HANDSHAKE_WAIT_TIMEOUT 1000000
 
 /* This file implements 4 actions that can be applied to a packet:
  *          posting,
@@ -52,6 +57,12 @@ ssize_t rxr_pkt_post_data(struct rxr_ep *rxr_ep,
 	struct rxr_pkt_entry *pkt_entry;
 	struct rxr_data_pkt *data_pkt;
 	ssize_t ret;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(rxr_ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_efa_pool);
 	if (OFI_UNLIKELY(!pkt_entry))
@@ -80,7 +91,7 @@ ssize_t rxr_pkt_post_data(struct rxr_ep *rxr_ep,
 	 * For now, always send CUDA buffers through
 	 * rxr_pkt_send_data_desc().
 	 */
-	if (efa_mr_cache_enable || rxr_ep_is_cuda_mr(tx_entry->desc[0]))
+	if (efa_is_cache_available(efa_domain) || efa_ep_is_cuda_mr(tx_entry->desc[0]))
 		ret = rxr_pkt_send_data_desc(rxr_ep, tx_entry, pkt_entry);
 	else
 		ret = rxr_pkt_send_data(rxr_ep, tx_entry, pkt_entry);
@@ -241,6 +252,7 @@ void rxr_pkt_handle_ctrl_sent(struct rxr_ep *rxr_ep, struct rxr_pkt_entry *pkt_e
 ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_entry,
 			       int ctrl_type, bool inject)
 {
+	struct rxr_pkt_sendv send;
 	struct rxr_pkt_entry *pkt_entry;
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_rx_entry *rx_entry;
@@ -267,6 +279,12 @@ ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_en
 	if (!pkt_entry)
 		return -FI_EAGAIN;
 
+	send.iov_count = 0;
+	pkt_entry->send = &send;
+
+	/*
+	 * rxr_pkt_init_ctrl() will set pkt_entry->send if it wants to use multiple iovs.
+	 */
 	err = rxr_pkt_init_ctrl(rxr_ep, entry_type, x_entry, ctrl_type, pkt_entry);
 	if (OFI_UNLIKELY(err)) {
 		rxr_pkt_entry_release_tx(rxr_ep, pkt_entry);
@@ -279,13 +297,14 @@ ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_en
 	 */
 	if (inject)
 		err = rxr_pkt_entry_inject(rxr_ep, pkt_entry, addr);
-	else if (pkt_entry->iov_count > 0)
+	else if (pkt_entry->send->iov_count > 0)
 		err = rxr_pkt_entry_sendv(rxr_ep, pkt_entry, addr,
-					  pkt_entry->iov, pkt_entry->desc,
-					  pkt_entry->iov_count, 0);
+					  pkt_entry->send->iov, pkt_entry->send->desc,
+					  pkt_entry->send->iov_count, 0);
 	else
 		err = rxr_pkt_entry_send(rxr_ep, pkt_entry, addr);
 
+	pkt_entry->send = NULL;
 	if (OFI_UNLIKELY(err)) {
 		rxr_pkt_entry_release_tx(rxr_ep, pkt_entry);
 		return err;
@@ -353,6 +372,190 @@ ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_en
 	return err;
 }
 
+/*
+ * This function is used before sending a message that relies on an extra
+ * feature which has no alternative.
+ *
+ * It sends an eager RTW packet to trigger a handshake.
+ *
+ * We do not send eager RTM packets here because the receiver might require
+ * ordering, and an extra eager RTM would interrupt the reorder process.
+ *
+ * ep: The endpoint on which the packet triggering the handshake will be sent.
+ * peer: The peer from which the sender expects the handshake.
+ * addr: The address of the peer.
+ *
+ * This function returns 0 if the sender receives, or has already received,
+ * the handshake from the peer.
+ *
+ * It returns -FI_EAGAIN if it fails to allocate or send the trigger packet,
+ * or if the handshake packet does not arrive within
+ * RXR_HANDSHAKE_WAIT_TIMEOUT.
+ */
+
+ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_peer *peer)
+{
+	struct rxr_tx_entry *tx_entry;
+	ssize_t err;
+
+	uint64_t current, endwait;
+
+	if (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)
+		return 0;
+
+	tx_entry = ofi_buf_alloc(ep->tx_entry_pool);
+	if (OFI_UNLIKELY(!tx_entry)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "TX entries exhausted.\n");
+		return -FI_EAGAIN;
+	}
+
+	tx_entry->total_len = 0;
+	tx_entry->addr = addr;
+	tx_entry->msg_id = -1;
+	tx_entry->cq_entry.flags = FI_RMA | FI_WRITE;
+	tx_entry->cq_entry.buf = NULL;
+	dlist_init(&tx_entry->queued_pkts);
+
+	tx_entry->type = RXR_TX_ENTRY;
+	tx_entry->op = ofi_op_write;
+	tx_entry->state = RXR_TX_REQ;
+
+	tx_entry->send_flags = 0;
+	tx_entry->bytes_acked = 0;
+	tx_entry->bytes_sent = 0;
+	tx_entry->window = 0;
+	tx_entry->rma_iov_count = 0;
+	tx_entry->iov_count = 0;
+	tx_entry->iov_index = 0;
+	tx_entry->iov_mr_start = 0;
+	tx_entry->iov_offset = 0;
+	tx_entry->fi_flags = RXR_NO_COMPLETION | RXR_NO_COUNTER;
+
+#if ENABLE_DEBUG
+	dlist_insert_tail(&tx_entry->tx_entry_entry, &ep->tx_entry_list);
+#endif
+
+	err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_EAGER_RTW_PKT, 0);
+
+	if (OFI_UNLIKELY(err))
+		return err;
+
+	current = ofi_gettime_us();
+	endwait = current + RXR_HANDSHAKE_WAIT_TIMEOUT;
+	while (current < endwait && !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
+		rxr_ep_progress_internal(ep);
+		current = ofi_gettime_us();
+	}
+
+	if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL,
+			"did not get handshake back in %f second(s). returning -FI_EAGAIN!\n",
+			RXR_HANDSHAKE_WAIT_TIMEOUT*1e-6);
+		return -FI_EAGAIN;
+	}
+
+	return 0;
+}
+
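
Callers treat a handshake timeout as retriable: the -FI_EAGAIN propagates up and the operation is expected to be re-issued, by which time the peer may have answered. Roughly the call pattern used by the senders in rxr_msg.c above:

	peer = rxr_ep_get_peer(ep, addr);
	err = rxr_pkt_wait_handshake(ep, addr, peer);
	if (OFI_UNLIKELY(err))
		return err;	/* -FI_EAGAIN: caller retries the whole operation */

	assert(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED);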
+/* return the data size in a packet entry */
+size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry)
+{
+	int pkt_type;
+
+	assert(pkt_entry);
+	pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type;
+
+	if (pkt_type == RXR_DATA_PKT)
+		return pkt_entry->pkt_size - sizeof(struct rxr_data_hdr);
+
+	if (pkt_type == RXR_READRSP_PKT)
+		return pkt_entry->pkt_size - sizeof(struct rxr_readrsp_hdr);
+
+	if (pkt_type >= RXR_REQ_PKT_BEGIN) {
+		assert(pkt_type == RXR_EAGER_MSGRTM_PKT || pkt_type == RXR_EAGER_TAGRTM_PKT ||
+		       pkt_type == RXR_MEDIUM_MSGRTM_PKT || pkt_type == RXR_MEDIUM_TAGRTM_PKT ||
+		       pkt_type == RXR_LONG_MSGRTM_PKT || pkt_type == RXR_LONG_TAGRTM_PKT ||
+		       pkt_type == RXR_EAGER_RTW_PKT || pkt_type == RXR_LONG_RTW_PKT);
+
+		return pkt_entry->pkt_size - rxr_pkt_req_hdr_size(pkt_entry);
+	}
+
+	/* Other packet types do not contain data, so return 0. */
+	return 0;
+}
+
+/*
+ * rxr_pkt_copy_to_rx() copies data to the receive buffer, then
+ * updates the counters in rx_entry.
+ *
+ * If the receive buffer is in GPU memory, it posts a local
+ * read request; otherwise it copies the data directly.
+ *
+ * Once all data has been copied to the receive buffer,
+ * it writes the rx completion and releases rx_entry.
+ *
+ * Return value and states:
+ *
+ *    On success, returns 0 and releases pkt_entry.
+ *    On failure, returns an error code.
+ */
+ssize_t rxr_pkt_copy_to_rx(struct rxr_ep *ep,
+			   struct rxr_rx_entry *rx_entry,
+			   size_t data_offset,
+			   struct rxr_pkt_entry *pkt_entry,
+			   char *data, size_t data_size)
+{
+	ssize_t err, bytes_copied;
+
+	pkt_entry->x_entry = rx_entry;
+
+	if (data_size > 0 && efa_ep_is_cuda_mr(rx_entry->desc[0])) {
+		err = rxr_read_post_local_read_or_queue(ep, rx_entry, data_offset,
+							pkt_entry, data, data_size);
+		if (err)
+			FI_WARN(&rxr_prov, FI_LOG_CQ, "cannot post read to copy data\n");
+
+		return err;
+	}
+
+	if (OFI_LIKELY(!(rx_entry->rxr_flags & RXR_RECV_CANCEL)) &&
+	    rx_entry->cq_entry.len > data_offset && data_size > 0) {
+		bytes_copied = ofi_copy_to_iov(rx_entry->iov,
+					       rx_entry->iov_count,
+					       data_offset,
+					       data,
+					       data_size);
+		if (bytes_copied != MIN(data_size, rx_entry->cq_entry.len - data_offset)) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ, "wrong size! bytes_copied: %ld\n",
+				bytes_copied);
+			return -FI_EINVAL;
+		}
+	}
+
+	rxr_pkt_handle_data_copied(ep, pkt_entry, data_size);
+	return 0;
+}
+
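
ofi_copy_to_iov() returns the number of bytes it actually placed, which the check above compares against MIN(data_size, cq_entry.len - data_offset) to detect truncation. A small worked sketch of that invariant with illustrative numbers:

	/* A 4096-byte receive, 1024 payload bytes arriving at offset 3584:
	 * only 512 bytes fit, and any other return value indicates a bug. */
	char recv_buf[4096], payload[1024];
	struct iovec iov = { .iov_base = recv_buf, .iov_len = sizeof(recv_buf) };
	size_t data_offset = 3584;
	ssize_t copied = ofi_copy_to_iov(&iov, 1, data_offset, payload, sizeof(payload));

	assert(copied == MIN(sizeof(payload), sizeof(recv_buf) - data_offset)); /* 512 */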
+void rxr_pkt_handle_data_copied(struct rxr_ep *ep,
+				struct rxr_pkt_entry *pkt_entry,
+				size_t data_size)
+{
+	struct rxr_rx_entry *rx_entry;
+
+	rx_entry = pkt_entry->x_entry;
+	assert(rx_entry);
+	rx_entry->bytes_copied += data_size;
+
+	if (rx_entry->total_len == rx_entry->bytes_copied) {
+		rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry);
+		rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
+		rxr_release_rx_entry(ep, rx_entry);
+	} else {
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+	}
+}
+
 /*
  *   Functions used to handle packet send completion
  */
@@ -389,7 +592,7 @@ void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *
 		break;
 	case RXR_MEDIUM_MSGRTM_PKT:
 	case RXR_MEDIUM_TAGRTM_PKT:
-		rxr_pkt_handle_long_rtm_send_completion(ep, pkt_entry);
+		rxr_pkt_handle_medium_rtm_send_completion(ep, pkt_entry);
 		break;
 	case RXR_LONG_MSGRTM_PKT:
 	case RXR_LONG_TAGRTM_PKT:
@@ -440,10 +643,9 @@ void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *
  *  Functions used to handle packet receive completion
  */
 static
-fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
+fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, void *raw_addr)
 {
 	int i, ret;
-	void *raw_addr;
 	fi_addr_t rdm_addr;
 	struct efa_ep *efa_ep;
 	struct rxr_base_hdr *base_hdr;
@@ -468,7 +670,6 @@ fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry
 	}
 
 	assert(base_hdr->type >= RXR_REQ_PKT_BEGIN);
-	raw_addr = pkt_entry->raw_addr;
 
 	efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
 	ret = efa_av_insert_addr(efa_ep->av, (struct efa_ep_addr *)raw_addr,
@@ -505,15 +706,16 @@ void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
 	}
 
 	if (base_hdr->type >= RXR_REQ_PKT_BEGIN) {
-		rxr_pkt_proc_req_common_hdr(pkt_entry);
-		assert(pkt_entry->hdr_size > 0);
 		/*
 		 * as long as the REQ packet contains a raw address,
 		 * we will need to call insert because it might be a new
 		 * EP with a new Q-Key.
 		 */
-		if (OFI_UNLIKELY(pkt_entry->raw_addr != NULL))
-			pkt_entry->addr = rxr_pkt_insert_addr(ep, pkt_entry);
+		void *raw_addr;
+
+		raw_addr = rxr_pkt_req_raw_addr(pkt_entry);
+		if (OFI_UNLIKELY(raw_addr != NULL))
+			pkt_entry->addr = rxr_pkt_insert_addr(ep, pkt_entry, raw_addr);
 		else
 			pkt_entry->addr = src_addr;
 	} else {
@@ -522,10 +724,12 @@ void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
 	}
 
 #if ENABLE_DEBUG
-	dlist_remove(&pkt_entry->dbg_entry);
-	dlist_insert_tail(&pkt_entry->dbg_entry, &ep->rx_pkt_list);
+	if (!ep->use_zcpy_rx) {
+		dlist_remove(&pkt_entry->dbg_entry);
+		dlist_insert_tail(&pkt_entry->dbg_entry, &ep->rx_pkt_list);
+	}
 #ifdef ENABLE_RXR_PKT_DUMP
-	rxr_ep_print_pkt("Received", ep, (struct rxr_base_hdr *)pkt_entry->pkt);
+	rxr_pkt_print("Received", ep, (struct rxr_base_hdr *)pkt_entry->pkt);
 #endif
 #endif
 	peer = rxr_ep_get_peer(ep, pkt_entry->addr);
@@ -571,6 +775,11 @@ void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
 		rxr_pkt_handle_atomrsp_recv(ep, pkt_entry);
 		return;
 	case RXR_EAGER_MSGRTM_PKT:
+		if (ep->use_zcpy_rx && pkt_entry->type == RXR_PKT_ENTRY_USER)
+			rxr_pkt_handle_zcpy_recv(ep, pkt_entry);
+		else
+			rxr_pkt_handle_rtm_rta_recv(ep, pkt_entry);
+		return;
 	case RXR_EAGER_TAGRTM_PKT:
 	case RXR_MEDIUM_MSGRTM_PKT:
 	case RXR_MEDIUM_TAGRTM_PKT:
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h
index 419b14fc6da4ca6eedcf723a2a8e3b032e308f66..eb5d05d0e2d5a0debc3029f571f9224abfaa95f5 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_cmd.h
@@ -44,6 +44,18 @@ ssize_t rxr_pkt_post_ctrl(struct rxr_ep *ep, int entry_type, void *x_entry,
 ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry,
 				   int ctrl_type, bool inject);
 
+size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry);
+
+ssize_t rxr_pkt_copy_to_rx(struct rxr_ep *ep,
+			   struct rxr_rx_entry *rx_entry,
+			   size_t data_offset,
+			   struct rxr_pkt_entry *pkt_entry,
+			   char *data, size_t data_size);
+
+void rxr_pkt_handle_data_copied(struct rxr_ep *ep,
+				struct rxr_pkt_entry *pkt_entry,
+				size_t data_size);
+
 void rxr_pkt_handle_send_completion(struct rxr_ep *ep,
 				    struct fi_cq_data_entry *cq_entry);
 
@@ -51,6 +63,8 @@ void rxr_pkt_handle_recv_completion(struct rxr_ep *ep,
 				    struct fi_cq_data_entry *cq_entry,
 				    fi_addr_t src_addr);
 
+ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_peer *peer);
+
 #if ENABLE_DEBUG
 void rxr_pkt_print(char *prefix,
 		   struct rxr_ep *ep,
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c
index db2f7188ad3b6fb528af45a9979ab61e1b8f8ccb..5ed475bd15cc1db6ec9a29316b278ba1fe2d8806 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.c
@@ -42,10 +42,46 @@
 #include "efa.h"
 #include "rxr_msg.h"
 #include "rxr_rma.h"
+#include "rxr_pkt_cmd.h"
 
 /*
  *   General purpose utility functions
  */
+
+struct rxr_pkt_entry *rxr_pkt_entry_init_prefix(struct rxr_ep *ep,
+						const struct fi_msg *posted_buf,
+						struct ofi_bufpool *pkt_pool)
+{
+	struct rxr_pkt_entry *pkt_entry;
+	struct efa_mr *mr;
+
+	/*
+	 * Given that pkt_entry->pkt immediately follows the pkt_entry
+	 * fields, we can directly map the user-provided fi_msg address
+	 * as the pkt_entry, which holds the metadata in the prefix.
+	 */
+	assert(posted_buf->msg_iov->iov_len >= sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr));
+	pkt_entry = (struct rxr_pkt_entry *) posted_buf->msg_iov->iov_base;
+	if (!pkt_entry)
+		return NULL;
+
+	/*
+	 * Ownership of the prefix buffer lies with the application; do not
+	 * put it on the dbg list for cleanup during shutdown, and do not
+	 * poison it. The provider loses jurisdiction over it soon after
+	 * writing the rx completion.
+	 */
+	dlist_init(&pkt_entry->entry);
+	mr = (struct efa_mr *) posted_buf->desc[0];
+	pkt_entry->mr = &mr->mr_fid;
+
+	pkt_entry->type = RXR_PKT_ENTRY_USER;
+	pkt_entry->state = RXR_PKT_ENTRY_IN_USE;
+	pkt_entry->next = NULL;
+
+	return pkt_entry;
+}
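
For context, a hedged sketch of the application-side buffer this maps: with FI_MSG_PREFIX in the mode bits, the provider owns the first msg_prefix_size bytes of every posted receive buffer, and rxr_pkt_entry_init_prefix() reinterprets them as the packet-entry metadata. Endpoint and MR setup are elided, and MSG_SIZE is an assumed payload size:

    size_t prefix = info->ep_attr->msg_prefix_size;
    char *buf = malloc(prefix + MSG_SIZE);
    struct iovec iov = {
            .iov_base = buf,
            .iov_len  = prefix + MSG_SIZE,
    };
    struct fi_msg msg = {
            .msg_iov   = &iov,
            .desc      = &desc,          /* from fi_mr_desc() on the registered buf */
            .iov_count = 1,
            .addr      = FI_ADDR_UNSPEC,
    };
    fi_recvmsg(ep_fid, &msg, 0);         /* payload lands at buf + prefix */
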
+
 struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
 					  struct ofi_bufpool *pkt_pool)
 {
@@ -55,6 +91,7 @@ struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
 	pkt_entry = ofi_buf_alloc_ex(pkt_pool, &mr);
 	if (!pkt_entry)
 		return NULL;
+
 #ifdef ENABLE_EFA_POISONING
 	memset(pkt_entry, 0, sizeof(*pkt_entry));
 #endif
@@ -62,15 +99,14 @@ struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
 #if ENABLE_DEBUG
 	dlist_init(&pkt_entry->dbg_entry);
 #endif
-	pkt_entry->mr = (struct fid_mr *)mr;
-	pkt_entry->pkt = (struct rxr_pkt *)((char *)pkt_entry +
-			  sizeof(*pkt_entry));
+	pkt_entry->mr = (struct fid_mr *) mr;
 #ifdef ENABLE_EFA_POISONING
 	memset(pkt_entry->pkt, 0, ep->mtu_size);
 #endif
+	pkt_entry->type = RXR_PKT_ENTRY_POSTED;
 	pkt_entry->state = RXR_PKT_ENTRY_IN_USE;
-	pkt_entry->iov_count = 0;
 	pkt_entry->next = NULL;
+
 	return pkt_entry;
 }
 
@@ -118,10 +154,24 @@ void rxr_pkt_entry_release_tx(struct rxr_ep *ep,
 	}
 }
 
-static
-void rxr_pkt_entry_release_single_rx(struct rxr_ep *ep,
-				     struct rxr_pkt_entry *pkt_entry)
+/*
+ * rxr_pkt_entry_release_rx() releases an rx packet entry.
+ * It requires the input pkt_entry to be unlinked.
+ *
+ * RX packet entries can be linked when the medium message
+ * protocol is used.
+ *
+ * In that case, the caller is responsible for unlinking the
+ * pkt_entry before calling this function on the next packet entry.
+ */
+void rxr_pkt_entry_release_rx(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt_entry)
 {
+	assert(pkt_entry->next == NULL);
+
+	if (ep->use_zcpy_rx && pkt_entry->type == RXR_PKT_ENTRY_USER)
+		return;
+
 	if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) {
 		struct rxr_peer *peer;
 
@@ -132,6 +182,12 @@ void rxr_pkt_entry_release_single_rx(struct rxr_ep *ep,
 		else
 			ep->rx_bufs_efa_to_post++;
 	}
+
+	if (pkt_entry->type == RXR_PKT_ENTRY_READ_COPY) {
+		assert(ep->rx_readcopy_pkt_pool_used > 0);
+		ep->rx_readcopy_pkt_pool_used--;
+	}
+
 #if ENABLE_DEBUG
 	dlist_remove(&pkt_entry->dbg_entry);
 #endif
@@ -143,36 +199,29 @@ void rxr_pkt_entry_release_single_rx(struct rxr_ep *ep,
 	ofi_buf_free(pkt_entry);
 }
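
Since the release function now asserts an unlinked entry, a caller holding a chain of medium-RTM packets must walk and unlink the chain itself. The pattern, as used by rxr_pkt_proc_matched_medium_rtm() later in this patch (head is an assumed pointer to the first entry of the chain):

    struct rxr_pkt_entry *cur = head, *nxt;

    while (cur) {
            nxt = cur->next;
            cur->next = NULL;                  /* unlink before releasing */
            rxr_pkt_entry_release_rx(ep, cur);
            cur = nxt;
    }
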
 
-void rxr_pkt_entry_release_rx(struct rxr_ep *ep,
-			      struct rxr_pkt_entry *pkt_entry)
-{
-	struct rxr_pkt_entry *next;
-
-	while (pkt_entry) {
-		next = pkt_entry->next;
-		rxr_pkt_entry_release_single_rx(ep, pkt_entry);
-		pkt_entry = next;
-	}
-}
-
-static
 void rxr_pkt_entry_copy(struct rxr_ep *ep,
 			struct rxr_pkt_entry *dest,
 			struct rxr_pkt_entry *src,
 			int new_entry_type)
 {
 	FI_DBG(&rxr_prov, FI_LOG_EP_CTRL,
-	       "Copying packet out of posted buffer\n");
-	assert(src->type == RXR_PKT_ENTRY_POSTED);
-	memcpy(dest, src, sizeof(struct rxr_pkt_entry));
-	dest->pkt = (struct rxr_pkt *)((char *)dest + sizeof(*dest));
-	memcpy(dest->pkt, src->pkt, ep->mtu_size);
+	       "Copying packet out of posted buffer! src_entry_type: %d new_entry_type: %d\n",
+		src->type, new_entry_type);
 	dlist_init(&dest->entry);
 #if ENABLE_DEBUG
 	dlist_init(&dest->dbg_entry);
 #endif
-	dest->state = RXR_PKT_ENTRY_IN_USE;
+	/* dest->mr was set in rxr_pkt_entry_alloc() and
+	 * is tied to the memory region; it therefore
+	 * should not be changed.
+	 */
+	dest->x_entry = src->x_entry;
+	dest->pkt_size = src->pkt_size;
+	dest->addr = src->addr;
 	dest->type = new_entry_type;
+	dest->state = RXR_PKT_ENTRY_IN_USE;
+	dest->next = NULL;
+	memcpy(dest->pkt, src->pkt, ep->mtu_size);
 }
 
 /*
@@ -227,12 +276,20 @@ struct rxr_pkt_entry *rxr_pkt_entry_clone(struct rxr_ep *ep,
 
 	assert(src);
 	assert(new_entry_type == RXR_PKT_ENTRY_OOO ||
-	       new_entry_type == RXR_PKT_ENTRY_UNEXP);
+	       new_entry_type == RXR_PKT_ENTRY_UNEXP ||
+	       new_entry_type == RXR_PKT_ENTRY_READ_COPY);
 
 	dst = rxr_pkt_entry_alloc(ep, pkt_pool);
 	if (!dst)
 		return NULL;
 
+	if (new_entry_type == RXR_PKT_ENTRY_READ_COPY) {
+		assert(pkt_pool == ep->rx_readcopy_pkt_pool);
+		ep->rx_readcopy_pkt_pool_used++;
+		ep->rx_readcopy_pkt_pool_max_used = MAX(ep->rx_readcopy_pkt_pool_used,
+							ep->rx_readcopy_pkt_pool_max_used);
+	}
+
 	rxr_pkt_entry_copy(ep, dst, src, new_entry_type);
 	root = dst;
 	while (src->next) {
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h
index 57c84013711d0b81a082879a0494ca41f0f183da..85173ffdf0608fd107cc6f394c3c481cdf545dc9 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_entry.h
@@ -45,26 +45,14 @@ enum rxr_pkt_entry_state {
 
 /* pkt_entry types for rx pkts */
 enum rxr_pkt_entry_type {
-	RXR_PKT_ENTRY_POSTED = 1,   /* entries that are posted to the core */
+	RXR_PKT_ENTRY_POSTED = 1,   /* entries that are posted to the device from the RX bufpool */
 	RXR_PKT_ENTRY_UNEXP,        /* entries used to stage unexpected msgs */
-	RXR_PKT_ENTRY_OOO	    /* entries used to stage out-of-order RTM or RTA */
+	RXR_PKT_ENTRY_OOO,	    /* entries used to stage out-of-order RTM or RTA */
+	RXR_PKT_ENTRY_USER,	    /* entries backed by user-provided msg prefix (FI_MSG_PREFIX)*/
+	RXR_PKT_ENTRY_READ_COPY,    /* entries used to stage copy by read */
 };
 
-struct rxr_pkt_entry {
-	/* for rx/tx_entry queued_pkts list */
-	struct dlist_entry entry;
-#if ENABLE_DEBUG
-	/* for tx/rx debug list or posted buf list */
-	struct dlist_entry dbg_entry;
-#endif
-	void *x_entry; /* pointer to rxr rx/tx entry */
-	size_t pkt_type;
-	size_t pkt_size;
-
-	size_t hdr_size;
-	void *raw_addr;
-	uint64_t cq_data;
-
+struct rxr_pkt_sendv {
 	/* Because the core EP currently only supports 2 iovs,
 	 * and for the sake of code simplicity, we use 2 iovs.
 	 * One is for the header, and the other is for the data.
@@ -74,28 +62,50 @@ struct rxr_pkt_entry {
 	int iov_count;
 	struct iovec iov[2];
 	void *desc[2];
+};
+
+struct rxr_pkt_entry {
+	/* for rx/tx_entry queued_pkts list */
+	struct dlist_entry entry;
+#if ENABLE_DEBUG
+	/* for tx/rx debug list or posted buf list */
+	struct dlist_entry dbg_entry;
+#endif
+	void *x_entry; /* pointer to rxr rx/tx entry */
+	size_t pkt_size;
 
 	struct fid_mr *mr;
 	fi_addr_t addr;
-	void *pkt; /* rxr_ctrl_*_pkt, or rxr_data_pkt */
 	enum rxr_pkt_entry_type type;
 	enum rxr_pkt_entry_state state;
-	struct rxr_pkt_entry *next;
+
+	/*
+	 * next is used on receiving end.
+	 * send is used on sending end.
+	 */
+	union {
+		struct rxr_pkt_entry *next;
+		struct rxr_pkt_sendv *send;
+	};
+
 #if ENABLE_DEBUG
-/* pad to cache line size of 64 bytes */
-	uint8_t pad[16];
-#else
-	uint8_t pad[32];
+	/* pad to cache line size of 64 bytes */
+	uint8_t pad[48];
 #endif
+	char pkt[0]; /* rxr_ctrl_*_pkt, or rxr_data_pkt */
 };
 
 static inline void *rxr_pkt_start(struct rxr_pkt_entry *pkt_entry)
 {
-	return (void *)((char *)pkt_entry + sizeof(*pkt_entry));
+	return pkt_entry->pkt;
 }
 
 #if defined(static_assert) && defined(__x86_64__)
-static_assert(sizeof(struct rxr_pkt_entry) == 192, "rxr_pkt_entry check");
+#if ENABLE_DEBUG
+static_assert(sizeof(struct rxr_pkt_entry) == 128, "rxr_pkt_entry check");
+#else
+static_assert(sizeof(struct rxr_pkt_entry) == 64, "rxr_pkt_entry check");
+#endif
 #endif
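
The flexible array member makes the old pointer arithmetic implicit: the payload now begins exactly where the (cache-line padded) metadata ends. A small illustrative check of that layout, not part of the patch:

    #include <assert.h>
    #include <stddef.h>

    void check_pkt_layout(struct rxr_pkt_entry *e)
    {
            /* pkt starts right after the padded metadata ... */
            assert(offsetof(struct rxr_pkt_entry, pkt) ==
                   sizeof(struct rxr_pkt_entry));
            /* ... which is the address the old rxr_pkt_start() computed by hand */
            assert((void *)e->pkt == (void *)((char *)e + sizeof(*e)));
    }
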
 
 OFI_DECL_RECVWIN_BUF(struct rxr_pkt_entry*, rxr_robuf, uint32_t);
@@ -105,6 +115,10 @@ struct rxr_ep;
 
 struct rxr_tx_entry;
 
+struct rxr_pkt_entry *rxr_pkt_entry_init_prefix(struct rxr_ep *ep,
+						const struct fi_msg *posted_buf,
+						struct ofi_bufpool *pkt_pool);
+
 struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep,
 					  struct ofi_bufpool *pkt_pool);
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h
index 58172dfd07acf3864f399d0f187ed990ff8ece0a..18c930237fbd87841b975f51cd219e6444db8824 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type.h
@@ -234,11 +234,11 @@ ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep,
 			       struct rxr_tx_entry *tx_entry,
 			       struct rxr_pkt_entry *pkt_entry);
 
-int rxr_pkt_proc_data(struct rxr_ep *ep,
-		      struct rxr_rx_entry *rx_entry,
-		      struct rxr_pkt_entry *pkt_entry,
-		      char *data, size_t seg_offset,
-		      size_t seg_size);
+void rxr_pkt_proc_data(struct rxr_ep *ep,
+		       struct rxr_rx_entry *rx_entry,
+		       struct rxr_pkt_entry *pkt_entry,
+		       char *data, size_t seg_offset,
+		       size_t seg_size);
 
 void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep,
 					 struct rxr_pkt_entry *pkt_entry);
@@ -341,14 +341,18 @@ struct rxr_eor_hdr {
 static_assert(sizeof(struct rxr_eor_hdr) == 12, "rxr_eor_hdr check");
 #endif
 
+static inline
+struct rxr_eor_hdr *rxr_get_eor_hdr(void *pkt)
+{
+	return (struct rxr_eor_hdr *)pkt;
+}
+
 int rxr_pkt_init_eor(struct rxr_ep *ep,
 		     struct rxr_rx_entry *rx_entry,
 		     struct rxr_pkt_entry *pkt_entry);
 
-static inline
-void rxr_pkt_handle_eor_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
-{
-}
+
+void rxr_pkt_handle_eor_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
 
 void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep,
 					struct rxr_pkt_entry *pkt_entry);
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c
index d00b5a86f6a40580beda92ad9406dcd509bd8f02..477b530209ba82cdb4432c41d68a68ad3dc85ca7 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_data.c
@@ -31,10 +31,10 @@
  * SOFTWARE.
  */
 
+#include "efa.h"
 #include "rxr.h"
 #include "rxr_msg.h"
 #include "rxr_pkt_cmd.h"
-#include "efa_cuda.h"
 
 /*
  * This function contains data packet related functions
@@ -51,9 +51,11 @@ ssize_t rxr_pkt_send_data(struct rxr_ep *ep,
 {
 	uint64_t payload_size, copied_size;
 	struct rxr_data_pkt *data_pkt;
+	struct efa_mr *desc;
 
 	pkt_entry->x_entry = (void *)tx_entry;
 	pkt_entry->addr = tx_entry->addr;
+	desc = tx_entry->desc[0];
 
 	payload_size = MIN(tx_entry->total_len - tx_entry->bytes_sent,
 			   ep->max_data_payload_size);
@@ -62,7 +64,13 @@ ssize_t rxr_pkt_send_data(struct rxr_ep *ep,
 	data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt;
 	data_pkt->hdr.seg_size = payload_size;
 
-	copied_size = rxr_copy_from_tx(data_pkt->data, payload_size, tx_entry, tx_entry->bytes_sent);
+	copied_size = ofi_copy_from_hmem_iov(data_pkt->data,
+					     payload_size,
+					     desc ? desc->peer.iface : FI_HMEM_SYSTEM,
+					     desc ? desc->peer.device.reserved : 0,
+					     tx_entry->iov,
+					     tx_entry->iov_count,
+					     tx_entry->bytes_sent);
 	assert(copied_size == payload_size);
 
 	pkt_entry->pkt_size = copied_size + sizeof(struct rxr_data_hdr);
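
The switch from the CUDA-specific helper to ofi_copy_from_hmem_iov() makes the bounce-buffer copy interface-agnostic: with no descriptor, the call degrades to a plain host-memory copy. The same call pattern in isolation (staging_buf and payload_size are assumed locals; the signature is as used above):

    struct efa_mr *desc = tx_entry->desc[0];
    uint64_t copied;

    copied = ofi_copy_from_hmem_iov(staging_buf, payload_size,
                                    desc ? desc->peer.iface : FI_HMEM_SYSTEM,
                                    desc ? desc->peer.device.reserved : 0,
                                    tx_entry->iov, tx_entry->iov_count,
                                    tx_entry->bytes_sent /* source offset */);
    assert(copied == payload_size);
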
@@ -131,6 +139,8 @@ ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep,
 	uint64_t payload_size = 0;
 	/* pkt_entry offset to write data into */
 	uint64_t pkt_used = 0;
+	uint64_t orig_iov_index;
+	uint64_t orig_iov_offset;
 	/* Remaining size that can fit in the constructed iov */
 	uint64_t remaining_len = MIN(tx_entry->window,
 				     ep->max_data_payload_size);
@@ -140,6 +150,9 @@ ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep,
 
 	ssize_t ret;
 
+	orig_iov_index = tx_entry->iov_index;
+	orig_iov_offset = tx_entry->iov_offset;
+
 	data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt;
 	/* Assign packet header in constructed iov */
 	iov[i].iov_base = rxr_pkt_start(pkt_entry);
@@ -205,6 +218,11 @@ ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep,
 	ret = rxr_pkt_entry_sendv(ep, pkt_entry, tx_entry->addr,
 				  (const struct iovec *)iov,
 				  desc, i, tx_entry->send_flags);
+	if (OFI_UNLIKELY(ret)) {
+		/* Reset tx_entry iov pointer on send failure. */
+		tx_entry->iov_index = orig_iov_index;
+		tx_entry->iov_offset = orig_iov_offset;
+	}
 	return ret;
 }
 
@@ -224,35 +242,29 @@ void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep,
 /*
  *  rxr_pkt_handle_data_recv() and related functions
  */
-int rxr_pkt_proc_data(struct rxr_ep *ep,
-		      struct rxr_rx_entry *rx_entry,
-		      struct rxr_pkt_entry *pkt_entry,
-		      char *data, size_t seg_offset,
-		      size_t seg_size)
+
+/*
+ * rxr_pkt_proc_data() processes data in a DATA/READRSP
+ * packet entry.
+ */
+void rxr_pkt_proc_data(struct rxr_ep *ep,
+		       struct rxr_rx_entry *rx_entry,
+		       struct rxr_pkt_entry *pkt_entry,
+		       char *data, size_t seg_offset,
+		       size_t seg_size)
 {
 	struct rxr_peer *peer;
-	int64_t bytes_left, bytes_copied;
-	ssize_t ret = 0;
+	bool all_received = false;
+	ssize_t err;
 
 #if ENABLE_DEBUG
 	int pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type;
 
 	assert(pkt_type == RXR_DATA_PKT || pkt_type == RXR_READRSP_PKT);
 #endif
-	/* we are sinking message for CANCEL/DISCARD entry */
-	if (OFI_LIKELY(!(rx_entry->rxr_flags & RXR_RECV_CANCEL)) &&
-	    rx_entry->cq_entry.len > seg_offset) {
-		bytes_copied = rxr_copy_to_rx(data, seg_size, rx_entry, seg_offset);
-
-		if (bytes_copied != MIN(seg_size, rx_entry->cq_entry.len - seg_offset)) {
-			FI_WARN(&rxr_prov, FI_LOG_CQ, "wrong size! bytes_copied: %ld\n",
-				bytes_copied);
-			if (rxr_cq_handle_rx_error(ep, rx_entry, -FI_EINVAL))
-				assert(0 && "error writing error cq entry for EOR\n");
-		}
-	}
-
-	rx_entry->bytes_done += seg_size;
+	rx_entry->bytes_received += seg_size;
+	assert(rx_entry->bytes_received <= rx_entry->total_len);
+	all_received = (rx_entry->bytes_received == rx_entry->total_len);
 
 	peer = rxr_ep_get_peer(ep, rx_entry->addr);
 	peer->rx_credits += ofi_div_ceil(seg_size, ep->max_data_payload_size);
@@ -261,33 +273,34 @@ int rxr_pkt_proc_data(struct rxr_ep *ep,
 	if (ep->available_data_bufs < rxr_get_rx_pool_chunk_cnt(ep))
 		ep->available_data_bufs++;
 
-	/* bytes_done is total bytes sent/received, which could be larger than
-	 * to bytes copied to recv buffer (for truncated messages).
-	 * rx_entry->total_len is from rtm header and is the size of send buffer,
-	 * thus we always have:
-	 *             rx_entry->total >= rx_entry->bytes_done
-	 */
-	bytes_left = rx_entry->total_len - rx_entry->bytes_done;
-	assert(bytes_left >= 0);
-	if (!bytes_left) {
 #if ENABLE_DEBUG
+	/* rx_entry can be released by rxr_pkt_copy_to_rx(),
+	 * so the call to dlist_remove must happen before
+	 * the call to rxr_pkt_copy_to_rx().
+	 */
+	if (all_received) {
 		dlist_remove(&rx_entry->rx_pending_entry);
 		ep->rx_pending--;
+	}
 #endif
-		rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry);
-
-		rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
-		rxr_release_rx_entry(ep, rx_entry);
-		return 0;
+	err = rxr_pkt_copy_to_rx(ep, rx_entry, seg_offset,
+				 pkt_entry, data, seg_size);
+	if (err) {
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		rxr_cq_handle_rx_error(ep, rx_entry, err);
 	}
 
+	if (all_received)
+		return;
+
 	if (!rx_entry->window) {
 		assert(rx_entry->state == RXR_RX_RECV);
-		ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
+		err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
+		if (err) {
+			FI_WARN(&rxr_prov, FI_LOG_CQ, "post CTS packet failed!\n");
+			rxr_cq_handle_rx_error(ep, rx_entry, err);
+		}
 	}
-
-	rxr_pkt_entry_release_rx(ep, pkt_entry);
-	return ret;
 }
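
The receiver-side flow control above condenses to: account for the segment, and if the message is still in flight and the sender's window is exhausted, grant more credits with a CTS. A stripped-down sketch (the window arithmetic inside rxr_pkt_calc_cts_window_credits() is elided):

    rx_entry->bytes_received += seg_size;
    if (rx_entry->bytes_received == rx_entry->total_len)
            return;                            /* completion comes via the copy path */

    if (!rx_entry->window)                     /* sender is out of credits */
            rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry,
                                       RXR_CTS_PKT, 0);
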
 
 void rxr_pkt_handle_data_recv(struct rxr_ep *ep,
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c
index 3790e7c3b4ab183dcd7179762b0a6e2964333d2d..80566dbf31d34e073297e0e48cc7aa0fd21adc20 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_misc.c
@@ -32,7 +32,6 @@
  */
 
 #include "efa.h"
-#include "efa_cuda.h"
 #include "rxr.h"
 #include "rxr_msg.h"
 #include "rxr_cntr.h"
@@ -181,7 +180,7 @@ ssize_t rxr_pkt_init_cts(struct rxr_ep *ep,
 	cts_hdr->tx_id = rx_entry->tx_id;
 	cts_hdr->rx_id = rx_entry->rx_id;
 
-	bytes_left = rx_entry->total_len - rx_entry->bytes_done;
+	bytes_left = rx_entry->total_len - rx_entry->bytes_received;
 	peer = rxr_ep_get_peer(ep, rx_entry->addr);
 	rxr_pkt_calc_cts_window_credits(ep, peer, bytes_left,
 					rx_entry->credit_request,
@@ -271,6 +270,11 @@ void rxr_pkt_handle_readrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_en
 {
 	struct rxr_tx_entry *tx_entry;
 	size_t data_len;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
 	data_len = rxr_get_readrsp_hdr(pkt_entry->pkt)->seg_size;
@@ -279,8 +283,8 @@ void rxr_pkt_handle_readrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_en
 	tx_entry->window -= data_len;
 	assert(tx_entry->window >= 0);
 	if (tx_entry->bytes_sent < tx_entry->total_len) {
-		assert(!rxr_ep_is_cuda_mr(tx_entry->desc[0]));
-		if (efa_mr_cache_enable && rxr_ep_mr_local(ep))
+		assert(!efa_ep_is_cuda_mr(tx_entry->desc[0]));
+		if (efa_is_cache_available(efa_domain) && rxr_ep_mr_local(ep))
 			rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry);
 
 		tx_entry->state = RXR_TX_SEND;
@@ -367,10 +371,12 @@ void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep,
 {
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_rx_entry *rx_entry;
+	struct rxr_pkt_entry *pkt_entry;
 	struct rxr_read_entry *read_entry;
 	struct rxr_rma_context_pkt *rma_context_pkt;
 	struct rxr_peer *peer;
 	int inject;
+	size_t data_size;
 	ssize_t ret;
 
 	rma_context_pkt = (struct rxr_rma_context_pkt *)context_pkt_entry->pkt;
@@ -382,42 +388,46 @@ void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep,
 	assert(read_entry->bytes_finished <= read_entry->total_len);
 
 	if (read_entry->bytes_finished == read_entry->total_len) {
-		if (read_entry->x_entry_type == RXR_TX_ENTRY) {
-			tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, read_entry->x_entry_id);
+		if (read_entry->context_type == RXR_READ_CONTEXT_TX_ENTRY) {
+			tx_entry = read_entry->context;
 			assert(tx_entry && tx_entry->cq_entry.flags & FI_READ);
 			rxr_cq_write_tx_completion(ep, tx_entry);
-		} else {
+		} else if (read_entry->context_type == RXR_READ_CONTEXT_RX_ENTRY) {
+			rx_entry = read_entry->context;
+			if (rx_entry->op == ofi_op_msg || rx_entry->op == ofi_op_tagged) {
+				rxr_cq_write_rx_completion(ep, rx_entry);
+			} else {
+				assert(rx_entry->op == ofi_op_write);
+				if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA)
+					rxr_cq_write_rx_completion(ep, rx_entry);
+			}
+
 			inject = (read_entry->lower_ep_type == SHM_EP);
-			rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, read_entry->x_entry_id);
 			ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_EOR_PKT, inject);
 			if (OFI_UNLIKELY(ret)) {
 				if (rxr_cq_handle_rx_error(ep, rx_entry, ret))
 					assert(0 && "failed to write err cq entry");
 				rxr_release_rx_entry(ep, rx_entry);
 			}
-
-			if (inject) {
-				/* inject will not generate a completion, so we write rx completion here,
-				 * otherwise, rx completion is write in rxr_pkt_handle_eor_send_completion
-				 */
-				if (rx_entry->op == ofi_op_msg || rx_entry->op == ofi_op_tagged) {
-					rxr_cq_write_rx_completion(ep, rx_entry);
-				} else {
-					assert(rx_entry->op == ofi_op_write);
-					if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA)
-						rxr_cq_write_rx_completion(ep, rx_entry);
-				}
-
-				rxr_release_rx_entry(ep, rx_entry);
-			}
+		} else {
+			assert(read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY);
+			pkt_entry = read_entry->context;
+			data_size = rxr_pkt_data_size(pkt_entry);
+			assert(data_size > 0);
+			rxr_pkt_handle_data_copied(ep, pkt_entry, data_size);
 		}
 
 		rxr_read_release_entry(ep, read_entry);
 	}
 
-	peer = rxr_ep_get_peer(ep, context_pkt_entry->addr);
-	if (!peer->is_local)
-		rxr_ep_dec_tx_pending(ep, peer, 0);
+	if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) {
+		assert(context_pkt_entry->addr == FI_ADDR_NOTAVAIL);
+		ep->tx_pending--;
+	} else {
+		peer = rxr_ep_get_peer(ep, context_pkt_entry->addr);
+		if (!peer->is_local)
+			rxr_ep_dec_tx_pending(ep, peer, 0);
+	}
 }
 
 void rxr_pkt_handle_rma_completion(struct rxr_ep *ep,
@@ -469,28 +479,20 @@ int rxr_pkt_init_eor(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, struct rx
 	return 0;
 }
 
-void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep,
-					struct rxr_pkt_entry *pkt_entry)
+void rxr_pkt_handle_eor_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_eor_hdr *eor_hdr;
 	struct rxr_rx_entry *rx_entry;
 
-	eor_hdr = (struct rxr_eor_hdr *)pkt_entry->pkt;
-
-	rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, eor_hdr->rx_id);
-	assert(rx_entry && rx_entry->rx_id == eor_hdr->rx_id);
-
-	if (rx_entry->op == ofi_op_msg || rx_entry->op == ofi_op_tagged) {
-		rxr_cq_write_rx_completion(ep, rx_entry);
-	} else {
-		assert(rx_entry->op == ofi_op_write);
-		if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA)
-			rxr_cq_write_rx_completion(ep, rx_entry);
-	}
-
+	rx_entry = pkt_entry->x_entry;
+	assert(rx_entry && rx_entry->rx_id == rxr_get_eor_hdr(pkt_entry->pkt)->rx_id);
 	rxr_release_rx_entry(ep, rx_entry);
 }
 
+void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep,
+					struct rxr_pkt_entry *pkt_entry)
+{
+}
+
 /*
  *   Sender handles the acknowledgment (RXR_EOR_PKT) from receiver on the completion
  *   of the large message copy via fi_readmsg operation
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c
index 9627d0cbe58dd90043f351aeeb5dad6bcec04436..8e3738bc4a15d7dab1d5de38b89deac1a7b1823e 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.c
@@ -38,7 +38,6 @@
 #include "rxr_msg.h"
 #include "rxr_pkt_cmd.h"
 #include "rxr_read.h"
-#include "efa_cuda.h"
 
 /*
  * Utility constants and functions shared by all REQ packet
@@ -87,8 +86,11 @@ struct rxr_req_inf REQ_INF_LIST[] = {
 
 size_t rxr_pkt_req_data_size(struct rxr_pkt_entry *pkt_entry)
 {
-	assert(pkt_entry->hdr_size > 0);
-	return pkt_entry->pkt_size - pkt_entry->hdr_size;
+	size_t hdr_size;
+
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	assert(hdr_size > 0);
+	return pkt_entry->pkt_size - hdr_size;
 }
 
 void rxr_pkt_init_req_hdr(struct rxr_ep *ep,
@@ -141,7 +143,6 @@ void rxr_pkt_init_req_hdr(struct rxr_ep *ep,
 	}
 
 	pkt_entry->addr = tx_entry->addr;
-	pkt_entry->hdr_size = opt_hdr - (char *)pkt_entry->pkt;
 }
 
 size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry)
@@ -168,81 +169,103 @@ size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry)
 	return hdr_size;
 }
 
-void rxr_pkt_proc_req_common_hdr(struct rxr_pkt_entry *pkt_entry)
+void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry)
 {
 	char *opt_hdr;
 	struct rxr_base_hdr *base_hdr;
+	struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr;
 
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
-
 	opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry);
 	if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
-		struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr;
-
 		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
-		pkt_entry->raw_addr = raw_addr_hdr->raw_addr;
-		opt_hdr += sizeof(*raw_addr_hdr) + raw_addr_hdr->addr_len;
-	} else {
-		pkt_entry->raw_addr = NULL;
+		return raw_addr_hdr->raw_addr;
 	}
 
-	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) {
-		struct rxr_req_opt_cq_data_hdr *cq_data_hdr;
+	return NULL;
+}
 
-		cq_data_hdr = (struct rxr_req_opt_cq_data_hdr *)opt_hdr;
-		pkt_entry->cq_data = cq_data_hdr->cq_data;
-		opt_hdr += sizeof(struct rxr_req_opt_cq_data_hdr);
+size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry)
+{
+	char *opt_hdr;
+	struct rxr_base_hdr *base_hdr;
+	struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr;
+
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry);
+	if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
+		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
+		opt_hdr += sizeof(struct rxr_req_opt_raw_addr_hdr) + raw_addr_hdr->addr_len;
 	}
 
-	pkt_entry->hdr_size = opt_hdr - (char *)pkt_entry->pkt;
+	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR)
+		opt_hdr += sizeof(struct rxr_req_opt_cq_data_hdr);
+
+	return opt_hdr - (char *)pkt_entry->pkt;
 }
 
-size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type)
+int64_t rxr_pkt_req_cq_data(struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_peer *peer;
-
-	peer = rxr_ep_get_peer(ep, addr);
-	assert(peer);
+	char *opt_hdr;
+	struct rxr_base_hdr *base_hdr;
+	struct rxr_req_opt_cq_data_hdr *cq_data_hdr;
+	struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr;
 
-	if (peer->is_local) {
-		assert(ep->use_shm);
-		return rxr_env.shm_max_medium_size;
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry);
+	if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) {
+		raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr;
+		opt_hdr += sizeof(struct rxr_req_opt_raw_addr_hdr) + raw_addr_hdr->addr_len;
 	}
 
+	assert(base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR);
+	cq_data_hdr = (struct rxr_req_opt_cq_data_hdr *)opt_hdr;
+	return cq_data_hdr->cq_data;
+}
+
+size_t rxr_pkt_req_max_header_size(int pkt_type)
+{
 	int max_hdr_size = REQ_INF_LIST[pkt_type].base_hdr_size
-		+ sizeof(struct rxr_req_opt_raw_addr_hdr)
+		+ sizeof(struct rxr_req_opt_raw_addr_hdr) + RXR_MAX_NAME_LENGTH
 		+ sizeof(struct rxr_req_opt_cq_data_hdr);
 
 	if (pkt_type == RXR_EAGER_RTW_PKT || pkt_type == RXR_LONG_RTW_PKT)
 		max_hdr_size += RXR_IOV_LIMIT * sizeof(struct fi_rma_iov);
 
-	return ep->mtu_size - max_hdr_size;
+	return max_hdr_size;
 }
 
-static
-size_t rxr_pkt_req_copy_data(struct rxr_rx_entry *rx_entry,
-			     struct rxr_pkt_entry *pkt_entry,
-			     char *data, size_t data_size)
+size_t rxr_pkt_max_header_size(void)
 {
-	size_t bytes_copied;
-	int bytes_left;
+	size_t max_hdr_size = 0;
+	size_t pkt_type = RXR_REQ_PKT_BEGIN;
 
-	bytes_copied = rxr_copy_to_rx(data, data_size, rx_entry, 0);
+	while (pkt_type < RXR_EXTRA_REQ_PKT_END) {
+		max_hdr_size = MAX(max_hdr_size,
+				rxr_pkt_req_max_header_size(pkt_type));
+		if (pkt_type == RXR_BASELINE_REQ_PKT_END)
+			pkt_type = RXR_EXTRA_REQ_PKT_BEGIN;
+		else
+			pkt_type += 1;
+	}
 
-	if (OFI_UNLIKELY(bytes_copied < data_size)) {
-		/* recv buffer is not big enough to hold req, this must be a truncated message */
-		assert(bytes_copied == rx_entry->cq_entry.len &&
-		       rx_entry->cq_entry.len < rx_entry->total_len);
-		rx_entry->bytes_done = bytes_copied;
-		bytes_left = 0;
-	} else {
-		assert(bytes_copied == data_size);
-		rx_entry->bytes_done = data_size;
-		bytes_left = rx_entry->total_len - rx_entry->bytes_done;
+	return max_hdr_size;
+}
+
+size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type)
+{
+	struct rxr_peer *peer;
+
+	peer = rxr_ep_get_peer(ep, addr);
+	assert(peer);
+
+	if (peer->is_local) {
+		assert(ep->use_shm);
+		return rxr_env.shm_max_medium_size;
 	}
 
-	assert(bytes_left >= 0);
-	return bytes_left;
+	return ep->mtu_size - rxr_pkt_req_max_header_size(pkt_type);
 }
 
 /*
@@ -252,8 +275,7 @@ size_t rxr_pkt_req_copy_data(struct rxr_rx_entry *rx_entry,
  */
 
 /*
- * this function is called after you have set header in pkt_entry->pkt and
- * pkt_entry->hdr_size
+ * this function is called after you have set the header in pkt_entry->pkt
  */
 void rxr_pkt_data_from_tx(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry,
 			  struct rxr_tx_entry *tx_entry, size_t data_offset,
@@ -262,38 +284,62 @@ void rxr_pkt_data_from_tx(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry,
 	int tx_iov_index;
 	size_t tx_iov_offset;
 	char *data;
+	size_t hdr_size;
+	struct efa_mr *desc;
 
+	assert(pkt_entry->send);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	assert(hdr_size > 0);
 	if (data_size == 0) {
-		pkt_entry->iov_count = 0;
-		pkt_entry->pkt_size = pkt_entry->hdr_size;
+		pkt_entry->send->iov_count = 0;
+		pkt_entry->pkt_size = hdr_size;
 		return;
 	}
 
 	rxr_locate_iov_pos(tx_entry->iov, tx_entry->iov_count, data_offset,
 			   &tx_iov_index, &tx_iov_offset);
+	desc = tx_entry->desc[0];
 	assert(tx_iov_index < tx_entry->iov_count);
 	assert(tx_iov_offset < tx_entry->iov[tx_iov_index].iov_len);
-	assert(pkt_entry->hdr_size > 0);
-	if (!tx_entry->desc[tx_iov_index]) {
-		data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
-		data_size = rxr_copy_from_tx(data, data_size, tx_entry, data_offset);
-		pkt_entry->iov_count = 0;
-		pkt_entry->pkt_size = pkt_entry->hdr_size + data_size;
+
+	/*
+	 * We want to go through the bounce buffers here only when
+	 * one of the following conditions is true:
+	 * 1. The application cannot register buffers (no FI_MR_LOCAL)
+	 * 2. desc.peer.iface is anything but FI_HMEM_SYSTEM
+	 * 3. prov/shm is not used for this transfer, and #1 or #2 holds true.
+	 *
+	 * In the first case, we use the pre-registered pkt_entry's MR. The
+	 * second case covers the eager and medium-message protocols, which
+	 * cannot rendezvous and pull the data from a peer. In the third case,
+	 * the bufpool would not have been created with a registration handler,
+	 * so pkt_entry->mr will be NULL.
+	 */
+	if (!tx_entry->desc[tx_iov_index] && pkt_entry->mr) {
+		data = (char *)pkt_entry->pkt + hdr_size;
+		data_size = ofi_copy_from_hmem_iov(data,
+					data_size,
+					desc ? desc->peer.iface : FI_HMEM_SYSTEM,
+					desc ? desc->peer.device.reserved : 0,
+					tx_entry->iov,
+					tx_entry->iov_count,
+					data_offset);
+		pkt_entry->send->iov_count = 0;
+		pkt_entry->pkt_size = hdr_size + data_size;
 		return;
 	}
 
-	/* when desc is available, we use it instead of copying */
 	assert(ep->core_iov_limit >= 2);
-	pkt_entry->iov[0].iov_base = pkt_entry->pkt;
-	pkt_entry->iov[0].iov_len = pkt_entry->hdr_size;
-	pkt_entry->desc[0] = fi_mr_desc(pkt_entry->mr);
+	pkt_entry->send->iov[0].iov_base = pkt_entry->pkt;
+	pkt_entry->send->iov[0].iov_len = hdr_size;
+	pkt_entry->send->desc[0] = pkt_entry->mr ? fi_mr_desc(pkt_entry->mr) : NULL;
 
-	pkt_entry->iov[1].iov_base = (char *)tx_entry->iov[tx_iov_index].iov_base + tx_iov_offset;
-	pkt_entry->iov[1].iov_len = MIN(data_size,
-					tx_entry->iov[tx_iov_index].iov_len - tx_iov_offset);
-	pkt_entry->desc[1] = tx_entry->desc[tx_iov_index];
-	pkt_entry->iov_count = 2;
-	pkt_entry->pkt_size = pkt_entry->hdr_size + pkt_entry->iov[1].iov_len;
+	pkt_entry->send->iov[1].iov_base = (char *)tx_entry->iov[tx_iov_index].iov_base + tx_iov_offset;
+	pkt_entry->send->iov[1].iov_len = MIN(data_size, tx_entry->iov[tx_iov_index].iov_len - tx_iov_offset);
+	pkt_entry->send->desc[1] = tx_entry->desc[tx_iov_index];
+	pkt_entry->send->iov_count = 2;
+	pkt_entry->pkt_size = hdr_size + pkt_entry->send->iov[1].iov_len;
 }
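
In code form, the branch described in the comment above reduces to a single predicate; the rest of rxr_pkt_data_from_tx() is serialization. A condensed sketch using names from this patch:

    bool use_bounce_buf = !tx_entry->desc[tx_iov_index] && pkt_entry->mr;

    if (use_bounce_buf) {
            /* copy the payload behind the header; send one registered buffer */
            pkt_entry->send->iov_count = 0;
    } else {
            /* gather: iov[0] = header (pkt pool MR), iov[1] = user buffer */
            pkt_entry->send->iov_count = 2;
    }
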
 
 void rxr_pkt_init_rtm(struct rxr_ep *ep,
@@ -303,14 +349,14 @@ void rxr_pkt_init_rtm(struct rxr_ep *ep,
 {
 	size_t data_size;
 	struct rxr_rtm_base_hdr *rtm_hdr;
-	/* this function set pkt_entry->hdr_size */
 	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
 
 	rtm_hdr = (struct rxr_rtm_base_hdr *)pkt_entry->pkt;
 	rtm_hdr->flags |= RXR_REQ_MSG;
 	rtm_hdr->msg_id = tx_entry->msg_id;
 
-	data_size = MIN(tx_entry->total_len - data_offset, ep->mtu_size - pkt_entry->hdr_size);
+	data_size = MIN(tx_entry->total_len - data_offset,
+			ep->mtu_size - rxr_pkt_req_hdr_size(pkt_entry));
 	rxr_pkt_data_from_tx(ep, pkt_entry, tx_entry, data_offset, data_size);
 	pkt_entry->x_entry = tx_entry;
 }
@@ -320,6 +366,7 @@ ssize_t rxr_pkt_init_eager_msgrtm(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_MSGRTM_PKT, 0, pkt_entry);
+	assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry));
 	return 0;
 }
 
@@ -330,6 +377,7 @@ ssize_t rxr_pkt_init_eager_tagrtm(struct rxr_ep *ep,
 	struct rxr_base_hdr *base_hdr;
 
 	rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_TAGRTM_PKT, 0, pkt_entry);
+	assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry));
 	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
 	base_hdr->flags |= RXR_REQ_TAGGED;
 	rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag);
@@ -408,6 +456,7 @@ ssize_t rxr_pkt_init_read_rtm(struct rxr_ep *ep,
 {
 	struct rxr_read_rtm_base_hdr *rtm_hdr;
 	struct fi_rma_iov *read_iov;
+	size_t hdr_size;
 	int err;
 
 	rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry);
@@ -419,12 +468,13 @@ ssize_t rxr_pkt_init_read_rtm(struct rxr_ep *ep,
 	rtm_hdr->tx_id = tx_entry->tx_id;
 	rtm_hdr->read_iov_count = tx_entry->iov_count;
 
-	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + pkt_entry->hdr_size);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + hdr_size);
 	err = rxr_read_init_iov(ep, tx_entry, read_iov);
 	if (OFI_UNLIKELY(err))
 		return err;
 
-	pkt_entry->pkt_size = pkt_entry->hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov);
+	pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov);
 	return 0;
 }
 
@@ -472,12 +522,17 @@ void rxr_pkt_handle_long_rtm_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_tx_entry *tx_entry;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
 	tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry);
 	assert(tx_entry->bytes_sent < tx_entry->total_len);
 
-	if (efa_mr_cache_enable || rxr_ep_is_cuda_mr(tx_entry->desc[0]))
+	if (efa_is_cache_available(efa_domain) || efa_ep_is_cuda_mr(tx_entry->desc[0]))
 		rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry);
 }
 
@@ -553,7 +608,7 @@ void rxr_pkt_rtm_init_rx_entry(struct rxr_pkt_entry *pkt_entry,
 	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) {
 		rx_entry->rxr_flags |= RXR_REMOTE_CQ_DATA;
 		rx_entry->cq_entry.flags |= FI_REMOTE_CQ_DATA;
-		rx_entry->cq_entry.data = pkt_entry->cq_data;
+		rx_entry->cq_entry.data = rxr_pkt_req_cq_data(pkt_entry);
 	}
 
 	rx_entry->addr = pkt_entry->addr;
@@ -723,7 +778,7 @@ ssize_t rxr_pkt_proc_matched_read_rtm(struct rxr_ep *ep,
 	struct fi_rma_iov *read_iov;
 
 	rtm_hdr = rxr_get_read_rtm_base_hdr(pkt_entry->pkt);
-	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + pkt_entry->hdr_size);
+	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry));
 
 	rx_entry->tx_id = rtm_hdr->tx_id;
 	rx_entry->rma_iov_count = rtm_hdr->read_iov_count;
@@ -736,41 +791,49 @@ ssize_t rxr_pkt_proc_matched_read_rtm(struct rxr_ep *ep,
 	 * need to do memory registration for the receiving buffer.
 	 */
 	ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_count, rx_entry->total_len);
-	return rxr_read_post_or_queue(ep, RXR_RX_ENTRY, rx_entry);
+	return rxr_read_post_remote_read_or_queue(ep, RXR_RX_ENTRY, rx_entry);
 }
 
 ssize_t rxr_pkt_proc_matched_medium_rtm(struct rxr_ep *ep,
 					struct rxr_rx_entry *rx_entry,
 					struct rxr_pkt_entry *pkt_entry)
 {
-	struct rxr_pkt_entry *cur;
+	struct rxr_pkt_entry *cur, *nxt;
 	char *data;
-	size_t offset, data_size;
+	ssize_t ret, err;
+	size_t offset, hdr_size, data_size;
 
+	ret = 0;
 	cur = pkt_entry;
 	while (cur) {
-		data = (char *)cur->pkt + cur->hdr_size;
+		hdr_size = rxr_pkt_req_hdr_size(cur);
+		data = (char *)cur->pkt + hdr_size;
 		offset = rxr_get_medium_rtm_base_hdr(cur->pkt)->offset;
-		data_size = cur->pkt_size - cur->hdr_size;
-		rxr_copy_to_rx(data, data_size, rx_entry, offset);
-		rx_entry->bytes_done += data_size;
-		cur = cur->next;
-	}
+		data_size = cur->pkt_size - hdr_size;
 
-	if (rx_entry->total_len == rx_entry->bytes_done) {
-		rxr_pkt_rx_map_remove(ep, pkt_entry, rx_entry);
-		/*
-		 * rxr_cq_handle_rx_completion() releases pkt_entry, thus
-		 * we do not release it here.
+		/* rxr_pkt_copy_to_rx() can release rx_entry, so
+		 * bytes_received must be calculated before it.
 		 */
-		rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry);
-		rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
-		rxr_release_rx_entry(ep, rx_entry);
-		return 0;
+		rx_entry->bytes_received += data_size;
+		if (rx_entry->total_len == rx_entry->bytes_received)
+			rxr_pkt_rx_map_remove(ep, cur, rx_entry);
+
+		/* rxr_pkt_copy_to_rx() will release cur, so
+		 * cur->next must be copied out before it.
+		 */
+		nxt = cur->next;
+		cur->next = NULL;
+
+		err = rxr_pkt_copy_to_rx(ep, rx_entry, offset, cur, data, data_size);
+		if (err) {
+			rxr_pkt_entry_release_rx(ep, cur);
+			ret = err;
+		}
+
+		cur = nxt;
 	}
 
-	rxr_pkt_entry_release_rx(ep, pkt_entry);
-	return 0;
+	return ret;
 }
 
 ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
@@ -779,7 +842,7 @@ ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 {
 	int pkt_type;
 	char *data;
-	size_t data_size, bytes_left;
+	size_t hdr_size, data_size;
 	ssize_t ret;
 
 	assert(rx_entry->state == RXR_RX_MATCHED);
@@ -801,18 +864,18 @@ ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 	if (pkt_type == RXR_MEDIUM_MSGRTM_PKT || pkt_type == RXR_MEDIUM_TAGRTM_PKT)
 		return rxr_pkt_proc_matched_medium_rtm(ep, rx_entry, pkt_entry);
 
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
-	data_size = pkt_entry->pkt_size - pkt_entry->hdr_size;
-	bytes_left = rxr_pkt_req_copy_data(rx_entry, pkt_entry,
-					   data, data_size);
-	if (!bytes_left) {
-		/*
-		 * rxr_cq_handle_rx_completion() releases pkt_entry, thus
-		 * we do not release it here.
-		 */
-		rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry);
-		rxr_msg_multi_recv_free_posted_entry(ep, rx_entry);
-		rxr_release_rx_entry(ep, rx_entry);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	data = (char *)pkt_entry->pkt + hdr_size;
+	data_size = pkt_entry->pkt_size - hdr_size;
+
+	rx_entry->bytes_received += data_size;
+	ret = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size);
+	if (ret) {
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return ret;
+	}
+
+	if (pkt_type == RXR_EAGER_MSGRTM_PKT || pkt_type == RXR_EAGER_TAGRTM_PKT) {
 		ret = 0;
 	} else {
 		/*
@@ -827,7 +890,6 @@ ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 		/* we have noticed that using the default value achieves better bandwidth */
 		rx_entry->credit_request = rxr_env.tx_min_credits;
 		ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0);
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
 	}
 
 	return ret;
@@ -930,6 +992,51 @@ ssize_t rxr_pkt_proc_rtm_rta(struct rxr_ep *ep,
 	return -FI_EINVAL;
 }
 
+void rxr_pkt_handle_zcpy_recv(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt_entry)
+{
+	struct rxr_rx_entry *rx_entry;
+
+	struct rxr_base_hdr *base_hdr __attribute__((unused));
+	base_hdr = rxr_get_base_hdr(pkt_entry->pkt);
+	assert(base_hdr->type >= RXR_BASELINE_REQ_PKT_BEGIN);
+	assert(base_hdr->type != RXR_MEDIUM_MSGRTM_PKT);
+	assert(base_hdr->type != RXR_MEDIUM_TAGRTM_PKT);
+	assert(pkt_entry->type == RXR_PKT_ENTRY_USER);
+
+	rx_entry = rxr_pkt_get_msgrtm_rx_entry(ep, &pkt_entry);
+	if (OFI_UNLIKELY(!rx_entry)) {
+		efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		return;
+	}
+	pkt_entry->x_entry = rx_entry;
+	if (rx_entry->state != RXR_RX_MATCHED)
+		return;
+
+	/*
+	 * The incoming receive will always get matched to the first posted
+	 * rx_entry available, so this is a constant cost. No real tag or
+	 * address matching happens.
+	 */
+	assert(rx_entry->state == RXR_RX_MATCHED);
+
+	/*
+	 * Adjust rx_entry->cq_entry.len as needed.
+	 * Initially, rx_entry->cq_entry.len is the total recv buffer size.
+	 * rx_entry->total_len is from the REQ packet and is the total send buffer size.
+	 * If the send buffer size < recv buffer size, we adjust rx_entry->cq_entry.len.
+	 * If the send buffer size > recv buffer size, we have a truncated message and
+	 * will write an error CQ entry.
+	 */
+	if (rx_entry->cq_entry.len > rx_entry->total_len)
+		rx_entry->cq_entry.len = rx_entry->total_len;
+
+	rxr_cq_write_rx_completion(ep, rx_entry);
+	rxr_pkt_entry_release_rx(ep, pkt_entry);
+	rxr_release_rx_entry(ep, rx_entry);
+}
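
A concrete instance of the length adjustment above: if the application posted an 8 KiB buffer and the sender transmitted 1 KiB, the completion reports 1 KiB. Restated as a single clamp (equivalent to the branch in the code, not patch text):

    rx_entry->cq_entry.len = MIN(rx_entry->cq_entry.len, rx_entry->total_len);
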
+
 void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep,
 				 struct rxr_pkt_entry *pkt_entry)
 {
@@ -1034,6 +1141,7 @@ void rxr_pkt_init_rtw_data(struct rxr_ep *ep,
 			   struct fi_rma_iov *rma_iov)
 {
 	char *data;
+	size_t hdr_size;
 	size_t data_size;
 	int i;
 
@@ -1043,11 +1151,12 @@ void rxr_pkt_init_rtw_data(struct rxr_ep *ep,
 		rma_iov[i].key = tx_entry->rma_iov[i].key;
 	}
 
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
-	data_size = ofi_copy_from_iov(data, ep->mtu_size - pkt_entry->hdr_size,
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	data = (char *)pkt_entry->pkt + hdr_size;
+	data_size = ofi_copy_from_iov(data, ep->mtu_size - hdr_size,
 				      tx_entry->iov, tx_entry->iov_count, 0);
 
-	pkt_entry->pkt_size = pkt_entry->hdr_size + data_size;
+	pkt_entry->pkt_size = hdr_size + data_size;
 	pkt_entry->x_entry = tx_entry;
 }
 
@@ -1090,6 +1199,7 @@ ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep,
 {
 	struct rxr_read_rtw_hdr *rtw_hdr;
 	struct fi_rma_iov *rma_iov, *read_iov;
+	size_t hdr_size;
 	int i, err;
 
 	assert(tx_entry->op == ofi_op_write);
@@ -1108,12 +1218,13 @@ ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep,
 		rma_iov[i].key = tx_entry->rma_iov[i].key;
 	}
 
-	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + pkt_entry->hdr_size);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + hdr_size);
 	err = rxr_read_init_iov(ep, tx_entry, read_iov);
 	if (OFI_UNLIKELY(err))
 		return err;
 
-	pkt_entry->pkt_size = pkt_entry->hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov);
+	pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov);
 	return 0;
 }
 
@@ -1126,11 +1237,16 @@ void rxr_pkt_handle_long_rtw_sent(struct rxr_ep *ep,
 				  struct rxr_pkt_entry *pkt_entry)
 {
 	struct rxr_tx_entry *tx_entry;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry;
 	tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry);
 	assert(tx_entry->bytes_sent < tx_entry->total_len);
-	if (efa_mr_cache_enable || rxr_ep_is_cuda_mr(tx_entry->desc[0]))
+	if (efa_is_cache_available(efa_domain) || efa_ep_is_cuda_mr(tx_entry->desc[0]))
 		rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry);
 }
 
@@ -1179,11 +1295,12 @@ struct rxr_rx_entry *rxr_pkt_alloc_rtw_rx_entry(struct rxr_ep *ep,
 	if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) {
 		rx_entry->rxr_flags |= RXR_REMOTE_CQ_DATA;
 		rx_entry->cq_entry.flags |= FI_REMOTE_CQ_DATA;
-		rx_entry->cq_entry.data = pkt_entry->cq_data;
+		rx_entry->cq_entry.data = rxr_pkt_req_cq_data(pkt_entry);
 	}
 
 	rx_entry->addr = pkt_entry->addr;
-	rx_entry->bytes_done = 0;
+	rx_entry->bytes_received = 0;
+	rx_entry->bytes_copied = 0;
 	return rx_entry;
 }
 
@@ -1193,8 +1310,8 @@ void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_eager_rtw_hdr *rtw_hdr;
 	char *data;
-	size_t data_size;
-	ssize_t err, bytes_left;
+	size_t data_size, hdr_size;
+	ssize_t err;
 
 	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
 	if (!rx_entry) {
@@ -1222,25 +1339,26 @@ void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep,
 	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
 	rx_entry->total_len = rx_entry->cq_entry.len;
 
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
-	data_size = pkt_entry->pkt_size - pkt_entry->hdr_size;
-	bytes_left = rxr_pkt_req_copy_data(rx_entry, pkt_entry, data, data_size);
-	if (bytes_left != 0) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "Eager RTM bytes_left is %ld, which should be 0.",
-			bytes_left);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	data = (char *)pkt_entry->pkt + hdr_size;
+	data_size = pkt_entry->pkt_size - hdr_size;
+
+	rx_entry->bytes_received += data_size;
+	if (data_size != rx_entry->total_len) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "Eager RTM size mismatch! data_size: %ld total_len: %ld.",
+			data_size, rx_entry->total_len);
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "target buffer: %p length: %ld", rx_entry->iov[0].iov_base,
 			rx_entry->iov[0].iov_len);
-		efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL);
-		rxr_release_rx_entry(ep, rx_entry);
-		rxr_pkt_entry_release_rx(ep, pkt_entry);
-		return;
+		err = FI_EINVAL;
+	} else {
+		err = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size);
 	}
 
-	if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA)
-		rxr_cq_write_rx_completion(ep, rx_entry);
-
-	rxr_release_rx_entry(ep, rx_entry);
-	rxr_pkt_entry_release_rx(ep, pkt_entry);
+	if (err) {
+		efa_eq_write_error(&ep->util_ep, err, -err);
+		rxr_pkt_entry_release_rx(ep, pkt_entry);
+		rxr_release_rx_entry(ep, rx_entry);
+	}
 }
 
 void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
@@ -1249,8 +1367,8 @@ void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_long_rtw_hdr *rtw_hdr;
 	char *data;
-	size_t data_size;
-	ssize_t err, bytes_left;
+	size_t hdr_size, data_size;
+	ssize_t err;
 
 	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
 	if (!rx_entry) {
@@ -1278,14 +1396,22 @@ void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
 	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
 	rx_entry->total_len = rx_entry->cq_entry.len;
 
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
-	data_size = pkt_entry->pkt_size - pkt_entry->hdr_size;
-	bytes_left = rxr_pkt_req_copy_data(rx_entry, pkt_entry, data, data_size);
-	if (OFI_UNLIKELY(bytes_left <= 0)) {
-		FI_WARN(&rxr_prov, FI_LOG_CQ, "Long RTM bytes_left is %ld, which should be > 0.",
-			bytes_left);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	data = (char *)pkt_entry->pkt + hdr_size;
+	data_size = pkt_entry->pkt_size - hdr_size;
+
+	rx_entry->bytes_received += data_size;
+	if (data_size >= rx_entry->total_len) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ, "Long RTM size mismatch! pkt_data_size: %ld total_len: %ld\n",
+			data_size, rx_entry->total_len);
 		FI_WARN(&rxr_prov, FI_LOG_CQ, "target buffer: %p length: %ld", rx_entry->iov[0].iov_base,
 			rx_entry->iov[0].iov_len);
+		err = FI_EINVAL;
+	} else {
+		err = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size);
+	}
+
+	if (err) {
 		efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL);
 		rxr_release_rx_entry(ep, rx_entry);
 		rxr_pkt_entry_release_rx(ep, pkt_entry);
@@ -1305,7 +1431,6 @@ void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep,
 		rxr_cq_handle_rx_error(ep, rx_entry, err);
 		rxr_release_rx_entry(ep, rx_entry);
 	}
-	rxr_pkt_entry_release_rx(ep, pkt_entry);
 }
 
 void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
@@ -1314,6 +1439,7 @@ void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
 	struct rxr_rx_entry *rx_entry;
 	struct rxr_read_rtw_hdr *rtw_hdr;
 	struct fi_rma_iov *read_iov;
+	size_t hdr_size;
 	ssize_t err;
 
 	rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry);
@@ -1342,7 +1468,8 @@ void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
 	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
 	rx_entry->total_len = rx_entry->cq_entry.len;
 
-	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + pkt_entry->hdr_size);
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + hdr_size);
 	rx_entry->addr = pkt_entry->addr;
 	rx_entry->tx_id = rtw_hdr->tx_id;
 	rx_entry->rma_iov_count = rtw_hdr->read_iov_count;
@@ -1350,7 +1477,7 @@ void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep,
 	       rx_entry->rma_iov_count * sizeof(struct fi_rma_iov));
 
 	rxr_pkt_entry_release_rx(ep, pkt_entry);
-	err = rxr_read_post_or_queue(ep, RXR_RX_ENTRY, rx_entry);
+	err = rxr_read_post_remote_read_or_queue(ep, RXR_RX_ENTRY, rx_entry);
 	if (OFI_UNLIKELY(err)) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
 			"RDMA post read or queue failed.\n");
@@ -1385,7 +1512,7 @@ void rxr_pkt_init_rtr(struct rxr_ep *ep,
 		rtr_hdr->rma_iov[i].key = tx_entry->rma_iov[i].key;
 	}
 
-	pkt_entry->pkt_size = pkt_entry->hdr_size;
+	pkt_entry->pkt_size = rxr_pkt_req_hdr_size(pkt_entry);
 	pkt_entry->x_entry = tx_entry;
 }
 
@@ -1454,7 +1581,8 @@ void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 	}
 
 	rx_entry->addr = pkt_entry->addr;
-	rx_entry->bytes_done = 0;
+	rx_entry->bytes_received = 0;
+	rx_entry->bytes_copied = 0;
 	rx_entry->cq_entry.flags |= (FI_RMA | FI_READ);
 	rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
 	rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base;
@@ -1506,7 +1634,7 @@ ssize_t rxr_pkt_init_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 	struct fi_rma_iov *rma_iov;
 	struct rxr_rta_hdr *rta_hdr;
 	char *data;
-	size_t data_size;
+	size_t hdr_size, data_size;
 	int i;
 
 	rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt;
@@ -1524,11 +1652,12 @@ ssize_t rxr_pkt_init_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry,
 		rma_iov[i].key = tx_entry->rma_iov[i].key;
 	}
 
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
-	data_size = ofi_copy_from_iov(data, ep->mtu_size - pkt_entry->hdr_size,
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	data = (char *)pkt_entry->pkt + hdr_size;
+	data_size = ofi_copy_from_iov(data, ep->mtu_size - hdr_size,
 				      tx_entry->iov, tx_entry->iov_count, 0);
 
-	pkt_entry->pkt_size = pkt_entry->hdr_size + data_size;
+	pkt_entry->pkt_size = hdr_size + data_size;
 	pkt_entry->x_entry = tx_entry;
 	return 0;
 }
@@ -1581,14 +1710,15 @@ int rxr_pkt_proc_write_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 	struct rxr_rta_hdr *rta_hdr;
 	char *data;
 	int iov_count, op, dt, i;
-	size_t dtsize, offset;
+	size_t dtsize, offset, hdr_size;
 
 	rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt;
 	op = rta_hdr->atomic_op;
 	dt = rta_hdr->atomic_datatype;
 	dtsize = ofi_datatype_size(dt);
 	
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
+	hdr_size = rxr_pkt_req_hdr_size(pkt_entry);
+	data = (char *)pkt_entry->pkt + hdr_size;
 	iov_count = rta_hdr->rma_iov_count;
 	rxr_rma_verified_copy_iov(ep, rta_hdr->rma_iov, iov_count, FI_REMOTE_WRITE, iov);
 
@@ -1667,7 +1797,7 @@ int rxr_pkt_proc_fetch_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
  	dt = rx_entry->atomic_hdr.datatype;	
 	dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype);
 
-	data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
+	data = (char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry);
 	rx_entry->atomrsp_buf = (char *)rx_entry->atomrsp_pkt->pkt + sizeof(struct rxr_atomrsp_hdr);
 
 	offset = 0;
@@ -1708,7 +1838,7 @@ int rxr_pkt_proc_compare_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry)
 	dt = rx_entry->atomic_hdr.datatype;
        	dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype);
 
-	src_data = (char *)pkt_entry->pkt + pkt_entry->hdr_size;
+	src_data = (char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry);
 	cmp_data = src_data + rx_entry->total_len;
 	rx_entry->atomrsp_buf = (char *)rx_entry->atomrsp_pkt->pkt + sizeof(struct rxr_atomrsp_hdr);
 
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h
index 58513fe14d4dd3f1df4a2ea920c7a1ed4bec15e7..401674aaa3d5ed73d1a4913279c5ae9c4b243b22 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_pkt_type_req.h
@@ -86,10 +86,18 @@ struct rxr_req_opt_cq_data_hdr {
 	int64_t cq_data;
 };
 
-void rxr_pkt_proc_req_common_hdr(struct rxr_pkt_entry *pkt_entry);
+void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry);
+
+int64_t rxr_pkt_req_cq_data(struct rxr_pkt_entry *pkt_entry);
+
+size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry);
 
 size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry);
 
+size_t rxr_pkt_req_max_header_size(int pkt_type);
+
+size_t rxr_pkt_max_header_size(void);
+
 size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type);
 
 /*
@@ -337,6 +345,11 @@ ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep,
 
 ssize_t rxr_pkt_proc_rtm_rta(struct rxr_ep *ep,
 			     struct rxr_pkt_entry *pkt_entry);
+/*
+ *         This function handles zero-copy receives that do not require ordering
+ */
+void rxr_pkt_handle_zcpy_recv(struct rxr_ep *ep,
+			      struct rxr_pkt_entry *pkt_entry);
 /*
  *         This function is shared by all RTM packet types which handle
  *         reordering
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_read.c b/deps/libfabric/prov/efa/src/rxr/rxr_read.c
index 5b75a65e041dbbe9531bee8baf33141ab1b93368..5ec4db0980259985fc080d6f74059b1003faf571 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_read.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_read.c
@@ -79,15 +79,125 @@ int rxr_locate_rma_iov_pos(struct fi_rma_iov *rma_iov, int rma_iov_count, size_t
 	return -1;
 }
 
+/*
+ * rxr_read_prepare_pkt_entry_mr() ensures pkt_entry's memory is registered.
+ *
+ * For a packet entry whose memory is not registered, it will reserve a pkt entry
+ * from rx_readcopy_pkt_pool and copy the data there.
+ *
+ * Return value:
+ *
+ *     On success, return 0
+ *     On packet entry reservation failure, return -FI_EAGAIN
+ */
+static
+ssize_t rxr_read_prepare_pkt_entry_mr(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
+{
+	size_t pkt_offset;
+	struct rxr_pkt_entry *pkt_entry;
+	struct rxr_pkt_entry *pkt_entry_copy;
+
+	assert(read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY);
+	/*
+	 * In this case, the target buffer is data in a pkt_entry, so rma_iov_count must be 1.
+	 */
+	assert(read_entry->rma_iov_count == 1);
+
+	pkt_entry = read_entry->context;
+	if (pkt_entry->mr) {
+		assert(read_entry->rma_iov[0].key == fi_mr_key(pkt_entry->mr));
+		return 0;
+	}
+
+	/* only ooo and unexp packet entry's memory is not registered with device */
+	assert(pkt_entry->type == RXR_PKT_ENTRY_OOO ||
+	       pkt_entry->type == RXR_PKT_ENTRY_UNEXP);
+
+	pkt_offset = (char *)read_entry->rma_iov[0].addr - (char *)pkt_entry->pkt;
+	assert(pkt_offset > sizeof(struct rxr_base_hdr));
+
+	pkt_entry_copy = rxr_pkt_entry_clone(ep, ep->rx_readcopy_pkt_pool,
+					     pkt_entry, RXR_PKT_ENTRY_READ_COPY);
+	if (!pkt_entry_copy) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"readcopy pkt pool exhausted! Set FI_EFA_READCOPY_POOL_SIZE to a higher value!\n");
+		return -FI_EAGAIN;
+	}
+
+	rxr_pkt_entry_release_rx(ep, pkt_entry);
+
+	assert(pkt_entry_copy->mr);
+	read_entry->context = pkt_entry_copy;
+	read_entry->rma_iov[0].addr = (uint64_t)pkt_entry_copy->pkt + pkt_offset;
+	read_entry->rma_iov[0].key = fi_mr_key(pkt_entry_copy->mr);
+
+	return 0;
+}
+
+/*
+ * rxr_read_mr_reg registers the memory of the local buffer if the application
+ * did not provide a descriptor.
+ * It is called by rxr_read_post().
+ * On success, it returns 0.
+ * If memory registration fails with -FI_ENOMEM, it returns -FI_EAGAIN.
+ * If memory registration fails with another error, it returns that error code.
+ */
+ssize_t rxr_read_mr_reg(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
+{
+	size_t i;
+	int err;
+
+	for (i = 0; i < read_entry->iov_count; ++i) {
+		if (read_entry->mr_desc[i] || read_entry->mr[i]) {
+			continue;
+		}
+
+		err = fi_mr_reg(rxr_ep_domain(ep)->rdm_domain,
+				read_entry->iov[i].iov_base, read_entry->iov[i].iov_len,
+				FI_RECV, 0, 0, 0, &read_entry->mr[i], NULL);
+
+		if (err) {
+			/* If registration failed with -FI_ENOMEM, we return -FI_EAGAIN.
+			 * This read entry will be put into a queue.
+			 *
+			 * The progress engine will progress other message transfers, which
+			 * will release registrations. Thus, when the progress engine calls this
+			 * function again later, registrations will be available.
+			 *
+			 * All registrations opened here will be closed when the read_entry
+			 * is released.
+			 */
+			FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to register MR buf for read!\n");
+			if (err == -FI_ENOMEM)
+				err = -FI_EAGAIN;
+			return err;
+		}
+
+		read_entry->mr_desc[i] = fi_mr_desc(read_entry->mr[i]);
+	}
+
+	return 0;
+}
+
+/* rxr_read_alloc_entry allocates a read entry.
+ * It is called by rxr_read_post_remote_read_or_queue().
+ * Input:
+ *   x_entry: can be a tx_entry or an rx_entry.
+ *            If x_entry is a tx_entry, the application called fi_read().
+ *            If x_entry is an rx_entry, the read message protocol is being used.
+ *   lower_ep_type: EFA_EP or SHM_EP
+ * Return:
+ *   On success, return a pointer to the allocated read_entry
+ *   Otherwise, return NULL
+ */
 struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, void *x_entry,
 					    enum rxr_lower_ep_type lower_ep_type)
 {
 	struct rxr_tx_entry *tx_entry = NULL;
 	struct rxr_rx_entry *rx_entry = NULL;
 	struct rxr_read_entry *read_entry;
-	int i, err;
+	int i;
 	size_t total_iov_len, total_rma_iov_len;
-	void **mr_desc;
 
 	read_entry = ofi_buf_alloc(ep->read_entry_pool);
 	if (OFI_UNLIKELY(!read_entry)) {
@@ -97,79 +207,63 @@ struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, v
 
 	read_entry->read_id = ofi_buf_index(read_entry);
 	read_entry->state = RXR_RDMA_ENTRY_CREATED;
-	read_entry->x_entry_type = entry_type;
 
 	if (entry_type == RXR_TX_ENTRY) {
 		tx_entry = (struct rxr_tx_entry *)x_entry;
 		assert(tx_entry->op == ofi_op_read_req);
-		read_entry->x_entry_id = tx_entry->tx_id;
+
+		read_entry->context_type = RXR_READ_CONTEXT_TX_ENTRY;
+		read_entry->context = tx_entry;
 		read_entry->addr = tx_entry->addr;
 
 		read_entry->iov_count = tx_entry->iov_count;
-		read_entry->iov = tx_entry->iov;
+		memcpy(read_entry->iov, tx_entry->iov,
+		       tx_entry->iov_count * sizeof(struct iovec));
 
 		read_entry->rma_iov_count = tx_entry->rma_iov_count;
-		read_entry->rma_iov = tx_entry->rma_iov;
+		memcpy(read_entry->rma_iov, tx_entry->rma_iov,
+		       tx_entry->rma_iov_count * sizeof(struct fi_rma_iov));
 
 		total_iov_len = ofi_total_iov_len(tx_entry->iov, tx_entry->iov_count);
 		total_rma_iov_len = ofi_total_rma_iov_len(tx_entry->rma_iov, tx_entry->rma_iov_count);
 		read_entry->total_len = MIN(total_iov_len, total_rma_iov_len);
-		mr_desc = tx_entry->desc;
+
+		if (tx_entry->desc) {
+			memcpy(read_entry->mr_desc, tx_entry->desc,
+			       read_entry->iov_count * sizeof(void *));
+		}
+
 	} else {
 		rx_entry = (struct rxr_rx_entry *)x_entry;
 		assert(rx_entry->op == ofi_op_write || rx_entry->op == ofi_op_msg ||
 		       rx_entry->op == ofi_op_tagged);
 
-		read_entry->x_entry_id = rx_entry->rx_id;
+		read_entry->context_type = RXR_READ_CONTEXT_RX_ENTRY;
+		read_entry->context = rx_entry;
 		read_entry->addr = rx_entry->addr;
 
 		read_entry->iov_count = rx_entry->iov_count;
-		read_entry->iov = rx_entry->iov;
+		memcpy(read_entry->iov, rx_entry->iov,
+		       rx_entry->iov_count * sizeof(struct iovec));
 
 		read_entry->rma_iov_count = rx_entry->rma_iov_count;
-		read_entry->rma_iov = rx_entry->rma_iov;
+		memcpy(read_entry->rma_iov, rx_entry->rma_iov,
+		       rx_entry->rma_iov_count * sizeof(struct fi_rma_iov));
 
-		mr_desc = rx_entry->desc;
 		total_iov_len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count);
 		total_rma_iov_len = ofi_total_rma_iov_len(rx_entry->rma_iov, rx_entry->rma_iov_count);
 		read_entry->total_len = MIN(total_iov_len, total_rma_iov_len);
+
+		if (rx_entry->desc) {
+			memcpy(read_entry->mr_desc, rx_entry->desc,
+			       read_entry->iov_count * sizeof(void *));
+		}
 	}
 
-	if (lower_ep_type == EFA_EP) {
-		/* EFA provider need local buffer registration */
-		for (i = 0; i < read_entry->iov_count; ++i) {
-			if (mr_desc && mr_desc[i]) {
-				read_entry->mr[i] = NULL;
-				read_entry->mr_desc[i] = mr_desc[i];
-			} else {
-				err = fi_mr_reg(rxr_ep_domain(ep)->rdm_domain,
-						read_entry->iov[i].iov_base, read_entry->iov[i].iov_len,
-						FI_RECV, 0, 0, 0, &read_entry->mr[i], NULL);
-
-				if (err == -FI_ENOMEM && efa_mr_cache_enable) {
-					/* In this case, we will try registration one more time because
-					 * mr cache will try to release MR when encountered error
-					 */
-					FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to register MR buf for FI_ENOMEM!\n");
-					FI_WARN(&rxr_prov, FI_LOG_MR, "Try again because MR cache will try release to release unused MR entry.\n");
-					err = fi_mr_reg(rxr_ep_domain(ep)->rdm_domain,
-							read_entry->iov[i].iov_base, read_entry->iov[i].iov_len,
-							FI_RECV, 0, 0, 0, &read_entry->mr[i], NULL);
-					if (!err)
-						FI_WARN(&rxr_prov, FI_LOG_MR, "The 2nd attemp was successful!");
-				}
+	memset(read_entry->mr, 0, read_entry->iov_count * sizeof(struct fid_mr *));
 
-				if (err) {
-					FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to register MR buf\n");
-					return NULL;
-				}
-
-				read_entry->mr_desc[i] = fi_mr_desc(read_entry->mr[i]);
-			}
-		}
-	} else {
+	if (lower_ep_type == SHM_EP) {
 		assert(lower_ep_type == SHM_EP);
-		memset(read_entry->mr, 0, read_entry->iov_count * sizeof(struct fid_mr *));
 		/* FI_MR_VIRT_ADDR is not being set, use 0-based offset instead. */
 		if (!(shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) {
 			for (i = 0; i < read_entry->rma_iov_count; ++i)
@@ -204,11 +298,30 @@ void rxr_read_release_entry(struct rxr_ep *ep, struct rxr_read_entry *read_entry
 	ofi_buf_free(read_entry);
 }
 
-int rxr_read_post_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry)
+static inline
+int rxr_read_post_or_queue(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
+{
+	int err;
+
+	err = rxr_read_post(ep, read_entry);
+	if (err == -FI_EAGAIN) {
+		dlist_insert_tail(&read_entry->pending_entry, &ep->read_pending_list);
+		read_entry->state = RXR_RDMA_ENTRY_PENDING;
+		err = 0;
+	} else if (err) {
+		rxr_read_release_entry(ep, read_entry);
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"RDMA post read failed. errno=%d.\n", err);
+	}
+
+	return err;
+}
+
+int rxr_read_post_remote_read_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry)
 {
 	struct rxr_peer *peer;
 	struct rxr_read_entry *read_entry;
-	int err, lower_ep_type;
+	int lower_ep_type;
 
 	if (entry_type == RXR_TX_ENTRY) {
 		peer = rxr_ep_get_peer(ep, ((struct rxr_tx_entry *)x_entry)->addr);
@@ -226,18 +339,66 @@ int rxr_read_post_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry)
 		return -FI_ENOBUFS;
 	}
 
-	err = rxr_read_post(ep, read_entry);
-	if (err == -FI_EAGAIN) {
-		dlist_insert_tail(&read_entry->pending_entry, &ep->read_pending_list);
-		read_entry->state = RXR_RDMA_ENTRY_PENDING;
-		err = 0;
-	} else if(err) {
-		rxr_read_release_entry(ep, read_entry);
+	return rxr_read_post_or_queue(ep, read_entry);
+}
+
+int rxr_read_post_local_read_or_queue(struct rxr_ep *ep,
+				      struct rxr_rx_entry *rx_entry,
+				      size_t data_offset,
+				      struct rxr_pkt_entry *pkt_entry,
+				      char *data, size_t data_size)
+{
+	int err;
+	struct rxr_read_entry *read_entry;
+
+	read_entry = ofi_buf_alloc(ep->read_entry_pool);
+	if (!read_entry) {
+		FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "RDMA entries exhausted\n");
+		return -FI_ENOBUFS;
+	}
+
+	read_entry->read_id = ofi_buf_index(read_entry);
+	read_entry->lower_ep_type = EFA_EP;
+	read_entry->context_type = RXR_READ_CONTEXT_PKT_ENTRY;
+	read_entry->context = pkt_entry;
+	read_entry->state = RXR_RDMA_ENTRY_CREATED;
+	read_entry->addr = FI_ADDR_NOTAVAIL;
+	read_entry->total_len = data_size;
+	read_entry->bytes_submitted = 0;
+	read_entry->bytes_finished = 0;
+
+	/* setup rma_iov */
+	read_entry->rma_iov_count = 1;
+	read_entry->rma_iov[0].addr = (uint64_t)data;
+	read_entry->rma_iov[0].len = data_size;
+	read_entry->rma_iov[0].key = (pkt_entry->mr) ? fi_mr_key(pkt_entry->mr) : 0;
+
+	/* setup iov */
+	assert(pkt_entry->x_entry == rx_entry);
+	assert(rx_entry->desc && efa_ep_is_cuda_mr(rx_entry->desc[0]));
+	read_entry->iov_count = rx_entry->iov_count;
+	memcpy(read_entry->iov, rx_entry->iov, rx_entry->iov_count * sizeof(struct iovec));
+	memcpy(read_entry->mr_desc, rx_entry->desc, rx_entry->iov_count * sizeof(void *));
+	ofi_consume_iov_desc(read_entry->iov, read_entry->mr_desc, &read_entry->iov_count, data_offset);
+	if (read_entry->iov_count == 0) {
 		FI_WARN(&rxr_prov, FI_LOG_CQ,
-			"RDMA post read failed. errno=%d.\n", err);
+			"data_offset %zu out of range\n",
+			data_offset);
+		ofi_buf_free(read_entry);
+		return -FI_ETRUNC;
 	}
 
-	return err;
+	assert(efa_ep_is_cuda_mr(read_entry->mr_desc[0]));
+	err = ofi_truncate_iov(read_entry->iov, &read_entry->iov_count, data_size);
+	if (err) {
+		FI_WARN(&rxr_prov, FI_LOG_CQ,
+			"data_offset %zu data_size %zu out of range\n",
+			data_offset, data_size);
+		ofi_buf_free(read_entry);
+		return -FI_ETRUNC;
+	}
+
+	return rxr_read_post_or_queue(ep, read_entry);
 }
 
 int rxr_read_init_iov(struct rxr_ep *ep,
@@ -298,29 +459,41 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 {
 	int ret;
 	int iov_idx = 0, rma_iov_idx = 0;
-	void *iov_ptr, *rma_iov_ptr;
-	struct rxr_peer *peer;
-	struct rxr_pkt_entry *pkt_entry;
+	bool self_comm;
 	size_t iov_offset = 0, rma_iov_offset = 0;
-	size_t total_iov_len, total_rma_iov_len;
-	size_t segsize, max_iov_segsize, max_rma_iov_segsize, max_read_size;
-	struct fid_ep *lower_ep;
-	fi_addr_t lower_ep_addr;
+	size_t total_iov_len, total_rma_iov_len, max_read_size;
+	struct rxr_pkt_entry *pkt_entry;
+	struct iovec iov;
+	struct fi_rma_iov rma_iov;
+	struct fi_msg_rma msg;
+	struct efa_ep *efa_ep;
+	struct rxr_peer *peer;
+	fi_addr_t shm_fiaddr;
 
 	assert(read_entry->iov_count > 0);
 	assert(read_entry->rma_iov_count > 0);
 	assert(read_entry->bytes_submitted < read_entry->total_len);
 
-	peer = rxr_ep_get_peer(ep, read_entry->addr);
+	if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) {
+		assert(read_entry->lower_ep_type == EFA_EP);
+		ret = rxr_read_prepare_pkt_entry_mr(ep, read_entry);
+		if (ret)
+			return ret;
+	}
+
 	if (read_entry->lower_ep_type == EFA_EP) {
-		max_read_size = efa_max_rdma_size(ep->rdm_ep);
-		lower_ep = ep->rdm_ep;
-		lower_ep_addr = read_entry->addr;
-	} else {
-		max_read_size = SIZE_MAX;
-		lower_ep = ep->shm_ep;
-		lower_ep_addr = peer->shm_fiaddr;
+		ret = rxr_read_mr_reg(ep, read_entry);
+		if (ret)
+			return ret;
 	}
+
+	peer = rxr_ep_get_peer(ep, read_entry->addr);
+	assert(peer);
+	if (read_entry->lower_ep_type == SHM_EP)
+		shm_fiaddr = peer->shm_fiaddr;
+
+	max_read_size = (read_entry->lower_ep_type == EFA_EP) ?
+				efa_max_rdma_size(ep->rdm_ep) : SIZE_MAX;
 	assert(max_read_size > 0);
 
 	ret = rxr_locate_iov_pos(read_entry->iov, read_entry->iov_count,
@@ -338,20 +511,27 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 	assert(read_entry->total_len == MIN(total_iov_len, total_rma_iov_len));
 
 	while (read_entry->bytes_submitted < read_entry->total_len) {
+
+		if (ep->tx_pending == ep->max_outstanding_tx)
+			return -FI_EAGAIN;
+
 		assert(iov_idx < read_entry->iov_count);
 		assert(iov_offset < read_entry->iov[iov_idx].iov_len);
 		assert(rma_iov_idx < read_entry->rma_iov_count);
 		assert(rma_iov_offset < read_entry->rma_iov[rma_iov_idx].len);
 
-		iov_ptr = (char *)read_entry->iov[iov_idx].iov_base + iov_offset;
-		rma_iov_ptr = (char *)read_entry->rma_iov[rma_iov_idx].addr + rma_iov_offset;
+		iov.iov_base = (char *)read_entry->iov[iov_idx].iov_base + iov_offset;
+		iov.iov_len = read_entry->iov[iov_idx].iov_len - iov_offset;
 
-		max_iov_segsize = read_entry->iov[iov_idx].iov_len - iov_offset;
-		max_rma_iov_segsize = read_entry->rma_iov[rma_iov_idx].len - rma_iov_offset;
-		segsize = MIN(max_iov_segsize, max_rma_iov_segsize);
+		rma_iov.addr = (uintptr_t)read_entry->rma_iov[rma_iov_idx].addr + rma_iov_offset;
+		rma_iov.len = read_entry->rma_iov[rma_iov_idx].len - rma_iov_offset;
+		rma_iov.key = read_entry->rma_iov[rma_iov_idx].key;
+
+		iov.iov_len = MIN(iov.iov_len, rma_iov.len);
 		if (read_entry->lower_ep_type == EFA_EP)
-			segsize = MIN(segsize, rxr_env.efa_read_segment_size);
-		segsize = MIN(segsize, max_read_size);
+			iov.iov_len = MIN(iov.iov_len, rxr_env.efa_read_segment_size);
+		iov.iov_len = MIN(iov.iov_len, max_read_size);
+		rma_iov.len = iov.iov_len;
 
 		/* because fi_send uses a pkt_entry as context
 		 * we had to use a pkt_entry as context too
@@ -364,31 +544,49 @@ int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry)
 		if (OFI_UNLIKELY(!pkt_entry))
 			return -FI_EAGAIN;
 
-		rxr_pkt_init_read_context(ep, read_entry, segsize, pkt_entry);
-
-		ret = fi_read(lower_ep,
-			      iov_ptr, segsize, read_entry->mr_desc[iov_idx],
-			      lower_ep_addr,
-			      (uint64_t)rma_iov_ptr, read_entry->rma_iov[rma_iov_idx].key,
-			      pkt_entry);
+		rxr_pkt_init_read_context(ep, read_entry, iov.iov_len, pkt_entry);
+
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_iov = &iov;
+		msg.desc = &read_entry->mr_desc[iov_idx];
+		msg.iov_count = 1;
+		msg.rma_iov = &rma_iov;
+		msg.rma_iov_count = 1;
+		msg.context = pkt_entry;
+
+		if (read_entry->lower_ep_type == SHM_EP) {
+			msg.addr = shm_fiaddr;
+			ret = fi_readmsg(ep->shm_ep, &msg, 0);
+		} else {
+			efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid);
+			msg.addr = read_entry->addr;
+			self_comm = (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY);
+			ret = efa_rma_post_read(efa_ep, &msg, 0, self_comm);
+		}
 
 		if (OFI_UNLIKELY(ret)) {
 			rxr_pkt_entry_release_tx(ep, pkt_entry);
 			return ret;
 		}
 
-		if (!peer->is_local)
+		if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) {
+			assert(read_entry->lower_ep_type == EFA_EP);
+			/* read from self, no peer */
+			ep->tx_pending++;
+		} else if (read_entry->lower_ep_type == EFA_EP) {
 			rxr_ep_inc_tx_pending(ep, peer);
-		read_entry->bytes_submitted += segsize;
+		}
+
+		read_entry->bytes_submitted += iov.iov_len;
 
-		iov_offset += segsize;
+		iov_offset += iov.iov_len;
 		assert(iov_offset <= read_entry->iov[iov_idx].iov_len);
 		if (iov_offset == read_entry->iov[iov_idx].iov_len) {
 			iov_idx += 1;
 			iov_offset = 0;
 		}
 
-		rma_iov_offset += segsize;
+		rma_iov_offset += rma_iov.len;
 		assert(rma_iov_offset <= read_entry->rma_iov[rma_iov_idx].len);
 		if (rma_iov_offset == read_entry->rma_iov[rma_iov_idx].len) {
 			rma_iov_idx += 1;
@@ -414,12 +612,12 @@ int rxr_read_handle_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry,
 	struct rxr_tx_entry *tx_entry;
 	struct rxr_rx_entry *rx_entry;
 
-	if (read_entry->x_entry_type == RXR_TX_ENTRY) {
-		tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, read_entry->x_entry_id);
+	if (read_entry->context_type == RXR_READ_CONTEXT_TX_ENTRY) {
+		tx_entry = read_entry->context;
 		ret = rxr_cq_handle_tx_error(ep, tx_entry, ret);
 	} else {
-		assert(read_entry->x_entry_type == RXR_RX_ENTRY);
-		rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, read_entry->x_entry_id);
+		assert(read_entry->context_type == RXR_READ_CONTEXT_RX_ENTRY);
+		rx_entry = read_entry->context;
 		ret = rxr_cq_handle_rx_error(ep, rx_entry, ret);
 	}
 
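The rewritten posting loop in rxr_read_post() above walks the local iov and the remote rma_iov in lockstep, clamping each post to the smaller of the two remaining segments and to the device's maximum read size. A minimal standalone sketch of that clamping logic follows; the rma_seg type, the names, and the limits are illustrative stand-ins, not the provider's own:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Illustrative stand-in for struct fi_rma_iov. */
struct rma_seg {
	uint64_t addr;
	size_t len;
};

/* Walk iov[]/rma[] in lockstep, "posting" bounded segments. */
static void post_segments(const struct iovec *iov, size_t iov_cnt,
			  const struct rma_seg *rma, size_t rma_cnt,
			  size_t total, size_t max_seg)
{
	size_t i = 0, j = 0, ioff = 0, roff = 0, done = 0;

	while (done < total) {
		assert(i < iov_cnt && j < rma_cnt);

		size_t seg = MIN(iov[i].iov_len - ioff,
				 rma[j].len - roff);

		seg = MIN(seg, max_seg);
		printf("post %zu bytes: iov[%zu]+%zu <- rma[%zu]+%zu\n",
		       seg, i, ioff, j, roff);

		done += seg;
		ioff += seg;
		if (ioff == iov[i].iov_len) {
			i++;
			ioff = 0;
		}
		roff += seg;
		if (roff == rma[j].len) {
			j++;
			roff = 0;
		}
	}
}

int main(void)
{
	char a[100], b[60];
	struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	struct rma_seg rma[1] = { { 0x1000, 160 } };

	post_segments(iov, 2, rma, 1, 160, 48);
	return 0;
}

In the provider, bytes_submitted together with rxr_locate_iov_pos() plays the role of done/ioff here, which is what lets the loop resume mid-walk after an -FI_EAGAIN.
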
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_read.h b/deps/libfabric/prov/efa/src/rxr/rxr_read.h
index 3d69374034b3bcae9b7039ad0801314d1740cf38..33b55c0eac9238664f9bdfb99ad1e8f684df3ba7 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_read.h
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_read.h
@@ -38,32 +38,59 @@
 #ifndef _RXR_RDMA_H_
 #define _RXR_RDMA_H_
 
+/*
+ * read can be used in 3 scenarios:
+ *
+ * 1. application posted a read request.
+ *
+ * 2. read message protocol is being used, and the receiver
+ *    is going to post a read request.
+ *
+ * 3. a packet entry with data has been received, and the
+ *    receiving buffer is on GPU memory. A read request is
+ *    being posted to copy data to the receiving buffer.
+ *
+ * To distinguish them, we use a pointer as context.
+ *
+ * For 1, the tx_entry is used as context
+ * For 2, the rx_entry is used as context
+ * For 3, the pkt_entry is used as context
+ *
+ * We also store rxr_read_context_type in read_entry to specify
+ * the context type.
+ */
+enum rxr_read_context_type {
+	RXR_READ_CONTEXT_TX_ENTRY,
+	RXR_READ_CONTEXT_RX_ENTRY,
+	RXR_READ_CONTEXT_PKT_ENTRY,
+};
+
 enum rxr_read_entry_state {
 	RXR_RDMA_ENTRY_FREE = 0,
 	RXR_RDMA_ENTRY_CREATED,
 	RXR_RDMA_ENTRY_PENDING
 };
 
-/* rxr_read_entry was arranged as a packet
- * and was put in a rxr_pkt_entry. Because rxr_pkt_entry is used
- * as context.
+/*
+ * rxr_read_entry contains the information of a read request
  */
 struct rxr_read_entry {
 	int read_id;
 	enum rxr_lower_ep_type lower_ep_type;
 
-	enum rxr_x_entry_type x_entry_type;
-	int x_entry_id;
+	void *context;
+	enum rxr_read_context_type context_type;
+
 	enum rxr_read_entry_state state;
 
 	fi_addr_t addr;
 
-	struct iovec *iov;
+	struct iovec iov[RXR_IOV_LIMIT];
 	size_t iov_count;
 	struct fid_mr *mr[RXR_IOV_LIMIT];
 	void *mr_desc[RXR_IOV_LIMIT];
 
-	struct fi_rma_iov *rma_iov;
+	struct fi_rma_iov rma_iov[RXR_IOV_LIMIT];
 	size_t rma_iov_count;
 
 	size_t total_len;
@@ -88,7 +115,13 @@ int rxr_read_init_iov(struct rxr_ep *ep,
 
 int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry);
 
-int rxr_read_post_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry);
+int rxr_read_post_remote_read_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry);
+
+int rxr_read_post_local_read_or_queue(struct rxr_ep *ep,
+				      struct rxr_rx_entry *rx_entry,
+				      size_t data_offset,
+				      struct rxr_pkt_entry *pkt_entry,
+				      char *data, size_t data_size);
 
 void rxr_read_handle_read_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry);
 
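The header change above swaps the old (x_entry_type, x_entry_id) pair for a void *context plus an enum tag, so the completion path can recover the right owner without an index lookup. This is the classic tagged-pointer idiom; a minimal sketch with hypothetical types:

#include <assert.h>
#include <stdio.h>

enum ctx_type { CTX_TX_ENTRY, CTX_RX_ENTRY, CTX_PKT_ENTRY };

struct tx_entry { int id; };
struct rx_entry { int id; };

struct read_req {
	enum ctx_type context_type;
	void *context;	/* interpreted according to context_type */
};

static void on_complete(const struct read_req *req)
{
	switch (req->context_type) {
	case CTX_TX_ENTRY:
		printf("tx %d done\n", ((struct tx_entry *)req->context)->id);
		break;
	case CTX_RX_ENTRY:
		printf("rx %d done\n", ((struct rx_entry *)req->context)->id);
		break;
	default:
		assert(req->context_type == CTX_PKT_ENTRY);
		printf("local pkt copy done\n");
	}
}

int main(void)
{
	struct tx_entry tx = { 7 };
	struct read_req req = { CTX_TX_ENTRY, &tx };

	on_complete(&req);
	return 0;
}
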
diff --git a/deps/libfabric/prov/efa/src/rxr/rxr_rma.c b/deps/libfabric/prov/efa/src/rxr/rxr_rma.c
index fe104b10f4b12a5f4beac3286e93a7c7460dd42c..f25569b50d57fa144143a6c3c2316f165a981a65 100644
--- a/deps/libfabric/prov/efa/src/rxr/rxr_rma.c
+++ b/deps/libfabric/prov/efa/src/rxr/rxr_rma.c
@@ -299,13 +299,16 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 	rxr_perfset_start(rxr_ep, perf_rxr_tx);
 	fastlock_acquire(&rxr_ep->util_ep.lock);
 
-	if (OFI_UNLIKELY(is_tx_res_full(rxr_ep)))
-		return -FI_EAGAIN;
+	if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) {
+		err = -FI_EAGAIN;
+		goto out;
+	}
 
 	tx_entry = rxr_rma_alloc_tx_entry(rxr_ep, msg, ofi_op_read_req, flags);
 	if (OFI_UNLIKELY(!tx_entry)) {
 		rxr_ep_progress_internal(rxr_ep);
-		return -FI_EAGAIN;
+		err = -FI_EAGAIN;
+		goto out;
 	}
 
 	peer = rxr_ep_get_peer(rxr_ep, msg->addr);
@@ -323,7 +326,7 @@ ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_
 	}
 
 	if (use_lower_ep_read) {
-		err = rxr_read_post_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry);
+		err = rxr_read_post_remote_read_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry);
 		if (OFI_UNLIKELY(err == -FI_ENOBUFS)) {
 			rxr_release_tx_entry(rxr_ep, tx_entry);
 			err = -FI_EAGAIN;
@@ -385,6 +388,11 @@ ssize_t rxr_rma_post_write(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 {
 	ssize_t err;
 	struct rxr_peer *peer;
+	struct efa_domain *efa_domain;
+	struct rxr_domain *rxr_domain = rxr_ep_domain(ep);
+
+	efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain,
+				  util_domain.domain_fid);
 
 	peer = rxr_ep_get_peer(ep, tx_entry->addr);
 	assert(peer);
@@ -397,7 +405,7 @@ ssize_t rxr_rma_post_write(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry)
 
 	if (tx_entry->total_len >= rxr_env.efa_min_read_write_size &&
 	    efa_both_support_rdma_read(ep, peer) &&
-	    (tx_entry->desc[0] || efa_mr_cache_enable)) {
+	    (tx_entry->desc[0] || efa_is_cache_available(efa_domain))) {
 		err = rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_READ_RTW_PKT, 0);
 		if (err != -FI_ENOMEM)
 			return err;
@@ -499,7 +507,7 @@ ssize_t rxr_rma_writedata(struct fid_ep *ep, const void *buf, size_t len,
 
 	memset(&msg, 0, sizeof(msg));
 	msg.msg_iov = &iov;
-	msg.desc = desc;
+	msg.desc = &desc;
 	msg.iov_count = 1;
 	msg.addr = dest_addr;
 	msg.context = context;
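
The goto-based rewrite of rxr_rma_readmsg() above is not cosmetic: the old early returns fired while util_ep.lock was still held. Routing every exit through one label keeps acquire and release paired. A generic sketch of the idiom, assuming a plain pthread mutex in place of fastlock:

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t ep_lock = PTHREAD_MUTEX_INITIALIZER;
static int tx_res_full;

static int readmsg(void)
{
	int err = 0;

	pthread_mutex_lock(&ep_lock);

	if (tx_res_full) {
		err = -EAGAIN;	/* early failure: still falls through to unlock */
		goto out;
	}

	/* ... allocate the tx entry and post the read under the lock ... */

out:
	pthread_mutex_unlock(&ep_lock);
	return err;
}

int main(void)
{
	tx_res_full = 1;
	return readmsg() == -EAGAIN ? 0 : 1;
}
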
diff --git a/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c b/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c
index f3f43ce80d1a91010a3eae0e64b9f5f68b3a504b..ba5cb77d4678cd740665a55febe0b58cb9f489af 100644
--- a/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c
+++ b/deps/libfabric/prov/hook/hook_debug/src/hook_debug.c
@@ -863,8 +863,10 @@ int hook_debug_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
 		return -FI_ENOMEM;
 
 	ret = hook_eq_init(fabric, attr, eq, context, &myeq->hook_eq);
-	if (ret)
+	if (ret) {
 		free(myeq);
+		return ret;
+	}
 
 	myeq->hook_eq.eq.ops = &hook_debug_eq_ops;
 	myeq->hook_eq.eq.fid.ops = &hook_debug_eq_fid_ops;
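
The extra return in hook_debug_eq_open() stops the function from initializing myeq after it has been freed on the error path. The same shape reduced to a self-contained sketch (names hypothetical):

#include <stdlib.h>

struct dbg_eq { int ops; };

static int eq_init(int fail) { return fail ? -1 : 0; }

static int eq_open(int fail, struct dbg_eq **out)
{
	struct dbg_eq *eq = calloc(1, sizeof(*eq));
	int ret;

	if (!eq)
		return -1;

	ret = eq_init(fail);
	if (ret) {
		free(eq);
		return ret;	/* must not touch eq past this point */
	}

	eq->ops = 1;
	*out = eq;
	return 0;
}

int main(void)
{
	struct dbg_eq *eq;

	if (eq_open(0, &eq))
		return 1;
	free(eq);
	return eq_open(1, &eq) ? 0 : 1;
}
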
diff --git a/deps/libfabric/prov/hook/src/hook_domain.c b/deps/libfabric/prov/hook/src/hook_domain.c
index 07d23cb1cc38e656cb0ac7013f062b9f027d628a..004e19ee6ec8fc72232fd8753ee511eea66b5237 100644
--- a/deps/libfabric/prov/hook/src/hook_domain.c
+++ b/deps/libfabric/prov/hook/src/hook_domain.c
@@ -77,6 +77,7 @@ static int hook_mr_regv(struct fid *fid, const struct iovec *iov,
 	attr.context = context;
 	attr.auth_key_size = 0;
 	attr.auth_key = NULL;
+	attr.iface = FI_HMEM_SYSTEM;
 
 	return hook_mr_regattr(fid, &attr, flags, mr);
 }
diff --git a/deps/libfabric/prov/mrail/src/mrail_av.c b/deps/libfabric/prov/mrail/src/mrail_av.c
index af2d7a91db9bda7f193af62055ea41fb3aad8149..f4d53ae29c4c7897b887195fce829ceadb0925f8 100644
--- a/deps/libfabric/prov/mrail/src/mrail_av.c
+++ b/deps/libfabric/prov/mrail/src/mrail_av.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2018-2020 Intel Corporation, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -141,7 +141,6 @@ static int mrail_av_insert(struct fid_av *av_fid, const void *addr, size_t count
 		if (ret) {
 			FI_WARN(&mrail_prov, FI_LOG_AV, \
 				"Unable to get rail fi_addr\n");
-			index = FI_ADDR_NOTAVAIL;
 		} else {
 			assert(index == index_rail0);
 			num_inserted++;
@@ -195,6 +194,7 @@ int mrail_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 	util_attr.addrlen = sizeof(struct mrail_peer_info);
 	/* We just need a table to store the mapping */
 	util_attr.flags = 0;
+	util_attr.context_len = 0;
 
 	if (attr->type == FI_AV_UNSPEC)
 		attr->type = FI_AV_TABLE;
diff --git a/deps/libfabric/prov/mrail/src/mrail_cq.c b/deps/libfabric/prov/mrail/src/mrail_cq.c
index 481ca14ffeec08cdf832d800eff3166fdabe9e54..4aab096eb707f03039353bf5777b13aa18804f54 100644
--- a/deps/libfabric/prov/mrail/src/mrail_cq.c
+++ b/deps/libfabric/prov/mrail/src/mrail_cq.c
@@ -387,6 +387,7 @@ static void mrail_save_ooo_recv(struct mrail_ep *mrail_ep,
 	if (!ooo_recv) {
 		FI_WARN(&mrail_prov, FI_LOG_CQ, "Cannot allocate ooo_recv\n");
 		assert(0);
+		return;
 	}
 	ooo_recv->entry.next = NULL;
 	ooo_recv->seq_no = seq_no;
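
The return added to mrail_save_ooo_recv() is the actual protection here: assert(0) compiles to nothing under NDEBUG, so a release build would otherwise fall through and dereference the NULL ooo_recv. A tiny demonstration of the pattern:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct ooo_recv { int seq_no; };

static void save_ooo_recv(int seq_no)
{
	struct ooo_recv *r = malloc(sizeof(*r));

	if (!r) {
		fprintf(stderr, "cannot allocate ooo_recv\n");
		assert(0);	/* a no-op when built with -DNDEBUG */
		return;		/* the real guard against the NULL deref */
	}
	r->seq_no = seq_no;
	free(r);
}

int main(void)
{
	save_ooo_recv(1);
	return 0;
}
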
diff --git a/deps/libfabric/prov/psm2/src/psmx2.h b/deps/libfabric/prov/psm2/src/psmx2.h
index 77cd9bdb3b6f5336bb14181f034ccc617cbeba9a..48ca3aca9ca6d326d1da7475cc561602db720f20 100644
--- a/deps/libfabric/prov/psm2/src/psmx2.h
+++ b/deps/libfabric/prov/psm2/src/psmx2.h
@@ -628,7 +628,8 @@ struct psmx2_cq_event {
 		struct fi_cq_err_entry		err;
 	} cqe;
 	int			error;
-	int			source_is_valid;
+	int8_t			source_is_valid;
+	uint8_t			source_sep_id;
 	psm2_epaddr_t		source;
 	struct psmx2_fid_av	*source_av;
 	struct slist_entry	list_entry;
@@ -766,6 +767,7 @@ struct psmx2_fid_ep {
 	size_t			min_multi_recv;
 	uint32_t		iov_seq_num;
 	int			service;
+	int			sep_id;
 };
 
 struct psmx2_sep_ctxt {
@@ -1021,7 +1023,7 @@ struct	psmx2_cq_event *psmx2_cq_create_event(struct psmx2_fid_cq *cq,
 int	psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, struct psmx2_trx_ctxt *trx_ctxt,
 			 struct psmx2_cq_event *event, int count, fi_addr_t *src_addr);
 
-int	psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt,
+void	psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt,
 			     psm2_epid_t epid, psm2_epaddr_t *epaddr);
 
 int	psmx2_av_add_trx_ctxt(struct psmx2_fid_av *av, struct psmx2_trx_ctxt *trx_ctxt);
@@ -1041,7 +1043,6 @@ psm2_epaddr_t psmx2_av_translate_addr(struct psmx2_fid_av *av,
 	psm2_epaddr_t epaddr;
 	size_t idx;
 	int ctxt;
-	int err;
 
 	if (av_type == FI_AV_MAP)
 		return (psm2_epaddr_t) addr;
@@ -1066,25 +1067,18 @@ psm2_epaddr_t psmx2_av_translate_addr(struct psmx2_fid_av *av,
 		ctxt = PSMX2_ADDR_CTXT(addr, av->rx_ctx_bits);
 		assert(ctxt < av->sep_info[idx].ctxt_cnt);
 
-		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt])) {
-			err = psmx2_epid_to_epaddr(trx_ctxt,
-						   av->sep_info[idx].epids[ctxt],
-						   &av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]);
-			assert(!err);
-		}
+		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]))
+			 psmx2_epid_to_epaddr(trx_ctxt,
+					      av->sep_info[idx].epids[ctxt],
+					      &av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]);
 		epaddr = av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt];
 	} else {
-		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].epaddrs[idx])) {
-			err = psmx2_epid_to_epaddr(trx_ctxt, av->table[idx].epid,
-						   &av->conn_info[trx_ctxt->id].epaddrs[idx]);
-			assert(!err);
-		}
+		if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].epaddrs[idx]))
+			psmx2_epid_to_epaddr(trx_ctxt, av->table[idx].epid,
+					     &av->conn_info[trx_ctxt->id].epaddrs[idx]);
 		epaddr = av->conn_info[trx_ctxt->id].epaddrs[idx];
 	}
 
-#ifdef NDEBUG
-	(void) err;
-#endif
 	av->domain->av_unlock_fn(&av->lock, 1);
 	return epaddr;
 }
@@ -1158,22 +1152,29 @@ static inline void psmx2_cntr_inc(struct psmx2_fid_cntr *cntr, int error)
 		cntr->wait->signal(cntr->wait);
 }
 
-fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t source);
+fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av,
+				    psm2_epaddr_t source, int source_sep_id);
 
-static inline void psmx2_get_source_name(psm2_epaddr_t source, struct psmx2_ep_name *name)
+static inline void psmx2_get_source_name(psm2_epaddr_t source,
+					 int source_sep_id,
+					 struct psmx2_ep_name *name)
 {
 	memset(name, 0, sizeof(*name));
 	psm2_epaddr_to_epid(source, &name->epid);
-	name->type = PSMX2_EP_REGULAR;
+	name->sep_id = source_sep_id;
+	name->type = source_sep_id ? PSMX2_EP_SCALABLE : PSMX2_EP_REGULAR;
 }
 
-static inline void psmx2_get_source_string_name(psm2_epaddr_t source, char *name, size_t *len)
+static inline void psmx2_get_source_string_name(psm2_epaddr_t source,
+						int source_sep_id,
+						char *name, size_t *len)
 {
 	struct psmx2_ep_name ep_name;
 
 	memset(&ep_name, 0, sizeof(ep_name));
 	psm2_epaddr_to_epid(source, &ep_name.epid);
-	ep_name.type = PSMX2_EP_REGULAR;
+	ep_name.sep_id = source_sep_id;
+	ep_name.type = source_sep_id ? PSMX2_EP_SCALABLE : PSMX2_EP_REGULAR;
 
 	ofi_straddr(name, len, FI_ADDR_PSMX2, &ep_name);
 }
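
psmx2_get_source_name() and its string variant now derive the endpoint type from the sep_id they are handed: nonzero means the peer is a scalable endpoint, zero means a regular one. A compact sketch of that mapping, with the struct trimmed to the relevant fields (the zero-is-regular convention is inferred from the code above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { EP_REGULAR, EP_SCALABLE };

struct ep_name {
	uint64_t epid;
	int sep_id;
	int type;
};

static void get_source_name(uint64_t epid, int source_sep_id,
			    struct ep_name *name)
{
	memset(name, 0, sizeof(*name));
	name->epid = epid;
	name->sep_id = source_sep_id;
	/* sep_id 0 is taken to mean a regular endpoint */
	name->type = source_sep_id ? EP_SCALABLE : EP_REGULAR;
}

int main(void)
{
	struct ep_name n;

	get_source_name(0xabcd, 3, &n);
	printf("type=%s\n", n.type == EP_SCALABLE ? "scalable" : "regular");
	return 0;
}
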
diff --git a/deps/libfabric/prov/psm2/src/psmx2_atomic.c b/deps/libfabric/prov/psm2/src/psmx2_atomic.c
index 576e922f3f4c45bef491ac7261942a03d0e73633..ba8a3876a6c9371b88bbcdf4d811ac454764b329 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_atomic.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_atomic.c
@@ -808,6 +808,7 @@ ssize_t psmx2_atomic_write_generic(struct fid_ep *ep,
 	psm2_epid_t psm2_epid;
 	int am_flags = PSM2_AM_FLAG_ASYNC;
 	int chunk_size, len;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -873,9 +874,15 @@ ssize_t psmx2_atomic_write_generic(struct fid_ep *ep,
 	args[3].u64 = key;
 	args[4].u32w0 = datatype;
 	args[4].u32w1 = op;
-	psm2_am_request_short(psm2_epaddr,
-			      PSMX2_AM_ATOMIC_HANDLER, args, 5,
-			      (void *)buf, len, am_flags, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX2_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, len, am_flags, NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
+
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -982,9 +989,15 @@ ssize_t psmx2_atomic_writev_generic(struct fid_ep *ep,
 	args[3].u64 = key;
 	args[4].u32w0 = datatype;
 	args[4].u32w1 = op;
-	psm2_am_request_short(psm2_epaddr,
-			      PSMX2_AM_ATOMIC_HANDLER, args, 5,
-			      (void *)buf, len, am_flags, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX2_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, len, am_flags, NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
+
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -1097,6 +1110,7 @@ ssize_t psmx2_atomic_readwrite_generic(struct fid_ep *ep,
 	psm2_epid_t psm2_epid;
 	int am_flags = PSM2_AM_FLAG_ASYNC;
 	int chunk_size, len;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -1168,9 +1182,16 @@ ssize_t psmx2_atomic_readwrite_generic(struct fid_ep *ep,
 	args[3].u64 = key;
 	args[4].u32w0 = datatype;
 	args[4].u32w1 = op;
-	psm2_am_request_short(psm2_epaddr,
-			      PSMX2_AM_ATOMIC_HANDLER, args, 5,
-			      (void *)buf, (buf?len:0), am_flags, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX2_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, (buf?len:0), am_flags, NULL,
+				    NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
+
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -1341,9 +1362,16 @@ ssize_t psmx2_atomic_readwritev_generic(struct fid_ep *ep,
 	args[3].u64 = key;
 	args[4].u32w0 = datatype;
 	args[4].u32w1 = op;
-	psm2_am_request_short(psm2_epaddr,
-			      PSMX2_AM_ATOMIC_HANDLER, args, 5,
-			      (void *)buf, (buf?len:0), am_flags, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX2_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, (buf?len:0), am_flags, NULL,
+				    NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
+
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -1476,6 +1504,7 @@ ssize_t psmx2_atomic_compwrite_generic(struct fid_ep *ep,
 	psm2_epid_t psm2_epid;
 	int am_flags = PSM2_AM_FLAG_ASYNC;
 	int chunk_size, len;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -1548,10 +1577,16 @@ ssize_t psmx2_atomic_compwrite_generic(struct fid_ep *ep,
 	args[3].u64 = key;
 	args[4].u32w0 = datatype;
 	args[4].u32w1 = op;
-	psm2_am_request_short(psm2_epaddr,
-			      PSMX2_AM_ATOMIC_HANDLER, args, 5,
-			      (void *)buf, len * 2, am_flags,
-			      NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX2_AM_ATOMIC_HANDLER, args, 5,
+				    (void *)buf, len * 2, am_flags,
+				    NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
+
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -1745,9 +1780,15 @@ ssize_t psmx2_atomic_compwritev_generic(struct fid_ep *ep,
 	args[3].u64 = key;
 	args[4].u32w0 = datatype;
 	args[4].u32w1 = op;
-	psm2_am_request_short(psm2_epaddr,
-			      PSMX2_AM_ATOMIC_HANDLER, args, 5,
-			      buf, len * 2, am_flags, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr,
+				    PSMX2_AM_ATOMIC_HANDLER, args, 5,
+				    buf, len * 2, am_flags, NULL, NULL);
+	if (err) {
+		free(req->tmpbuf);
+		psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
+
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
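
Every atomic path above now follows the same recovery shape: check the return of psm2_am_request_short(), and on failure release the temp buffer and the request before translating the error code. The pattern abstracted away from PSM2 (all names and the injected failure are hypothetical):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct am_req { void *tmpbuf; };

/* Stand-in for psm2_am_request_short(); nonzero means failure. */
static int am_request_short(int fail) { return fail ? EIO : 0; }

static int atomic_write(int inject_failure)
{
	struct am_req *req = calloc(1, sizeof(*req));
	int err;

	if (!req)
		return -ENOMEM;
	req->tmpbuf = malloc(64);

	err = am_request_short(inject_failure);
	if (err) {
		/* the request never left: unwind everything we allocated */
		free(req->tmpbuf);
		free(req);
		return -err;
	}

	/* freed here only to keep the demo leak-free; the real code
	 * hands ownership to the completion path */
	free(req->tmpbuf);
	free(req);
	return 0;
}

int main(void)
{
	printf("ok=%d fail=%d\n", atomic_write(0), atomic_write(1));
	return 0;
}
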
diff --git a/deps/libfabric/prov/psm2/src/psmx2_av.c b/deps/libfabric/prov/psm2/src/psmx2_av.c
index eb7668e302520ebbce90178307cc040e7ced1e0b..d21040d7bb47f22c11c8864677bc64cbac7ba305 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_av.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_av.c
@@ -202,8 +202,8 @@ static void psmx2_set_epaddr_context(struct psmx2_trx_ctxt *trx_ctxt,
 	trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2);
 }
 
-int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt,
-			 psm2_epid_t epid, psm2_epaddr_t *epaddr)
+void psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt,
+			  psm2_epid_t epid, psm2_epaddr_t *epaddr)
 {
 	int err;
 	psm2_error_t errors;
@@ -215,7 +215,7 @@ int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt,
 		context = psm2_epaddr_getctxt(epconn.addr);
 		if (context && context->epid  == epid) {
 			*epaddr = epconn.addr;
-			return 0;
+			return;
 		}
 	}
 
@@ -223,13 +223,17 @@ int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt,
 			      (int64_t) psmx2_env.conn_timeout * 1000000000LL);
 	if (err == PSM2_OK || err == PSM2_EPID_ALREADY_CONNECTED) {
 		psmx2_set_epaddr_context(trx_ctxt, epid, *epaddr);
-		return 0;
+		return;
 	}
 
-	FI_WARN(&psmx2_prov, FI_LOG_AV,
-		"psm2_ep_connect retured error %s, remote epid=%lx.\n",
-		psm2_error_get_string(err), epid);
-	return psmx2_errno(err);
+	/* call fi_log() directly to always generate the output */
+	fi_log(&psmx2_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__,
+		"psm2_ep_connect returned error %s, remote epid=%lx. "
+		"If it is a timeout error, try setting FI_PSM2_CONN_TIMEOUT "
+		"to a larger value (current: %d seconds).\n",
+		psm2_error_get_string(err), epid, psmx2_env.conn_timeout);
+
+	abort();
 }
 
 /*
@@ -335,11 +339,9 @@ int psmx2_av_query_sep(struct psmx2_fid_av *av,
 	psm2_amarg_t args[3];
 	int error;
 
-	if (!av->conn_info[trx_ctxt->id].epaddrs[idx]) {
+	if (!av->conn_info[trx_ctxt->id].epaddrs[idx])
 		psmx2_epid_to_epaddr(trx_ctxt, av->table[idx].epid,
 				     &av->conn_info[trx_ctxt->id].epaddrs[idx]);
-		assert(av->conn_info[trx_ctxt->id].epaddrs[idx]);
-	}
 
 	psmx2_am_init(trx_ctxt); /* check AM handler installation */
 
@@ -350,9 +352,12 @@ int psmx2_av_query_sep(struct psmx2_fid_av *av,
 	args[0].u32w1 = av->table[idx].sep_id;
 	args[1].u64 = (uint64_t)(uintptr_t)&av->sep_info[idx];
 	args[2].u64 = (uint64_t)(uintptr_t)&status;
-	psm2_am_request_short(av->conn_info[trx_ctxt->id].epaddrs[idx],
-			      PSMX2_AM_SEP_HANDLER, args, 3, NULL,
-			      0, 0, NULL, NULL);
+	error = psm2_am_request_short(av->conn_info[trx_ctxt->id].epaddrs[idx],
+				      PSMX2_AM_SEP_HANDLER, args, 3, NULL,
+				      0, 0, NULL, NULL);
+
+	if (error)
+		return error;
 
 	/*
 	 * make sure AM is progressed promptly. don't call
@@ -821,11 +826,13 @@ STATIC int psmx2_av_map_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
 	return 0;
 }
 
-fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t source)
+fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av,
+				    psm2_epaddr_t source, int source_sep_id)
 {
 	psm2_epid_t epid;
 	fi_addr_t ret;
 	int i, j, found;
+	int ep_type = source_sep_id ? PSMX2_EP_SCALABLE : PSMX2_EP_REGULAR;
 
 	if (av->type == FI_AV_MAP)
 		return (fi_addr_t) source;
@@ -841,11 +848,22 @@ fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t sourc
 			continue;
 
 		if (av->table[i].type == PSMX2_EP_REGULAR) {
+			if (ep_type == PSMX2_EP_SCALABLE)
+				continue;
 			if (av->table[i].epid == epid) {
 				ret = (fi_addr_t)i;
 				found = 1;
 			}
 		} else {
+			/*
+			 * scalable endpoint must match sep_id exactly.
+			 * regular endpoint can match a context of any
+			 * scalable endpoint.
+			 */
+			if (ep_type == PSMX2_EP_SCALABLE &&
+			    av->table[i].sep_id != source_sep_id)
+				continue;
+
 			if (!av->sep_info[i].epids) {
 				for (j = 0; j < av->max_trx_ctxt; j++) {
 					if (av->conn_info[j].trx_ctxt)
@@ -857,6 +875,7 @@ fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t sourc
 				if (!av->sep_info[i].epids)
 					continue;
 			}
+
 			for (j=0; j<av->sep_info[i].ctxt_cnt; j++) {
 				if (av->sep_info[i].epids[j] == epid) {
 					ret = fi_rx_addr((fi_addr_t)i, j,
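
The new comment in psmx2_av_translate_source() states the rule the loop enforces: a scalable sender must match a table entry's sep_id exactly, while a regular sender may match a regular entry or any context of a scalable one. A simplified sketch with one epid per entry (the real table scans a per-context epid array):

#include <stdint.h>
#include <stdio.h>

enum { EP_REGULAR, EP_SCALABLE };

struct av_entry {
	int type;
	int sep_id;
	uint64_t epid;
};

/* Return the table index of the sender, or -1 if not found. */
static int translate_source(const struct av_entry *tab, int n,
			    uint64_t src_epid, int src_sep_id)
{
	int src_type = src_sep_id ? EP_SCALABLE : EP_REGULAR;
	int i;

	for (i = 0; i < n; i++) {
		if (tab[i].type == EP_REGULAR &&
		    src_type == EP_SCALABLE)
			continue;	/* scalable sender is never a regular entry */

		if (tab[i].type == EP_SCALABLE &&
		    src_type == EP_SCALABLE &&
		    tab[i].sep_id != src_sep_id)
			continue;	/* scalable sender must match sep_id */

		if (tab[i].epid == src_epid)
			return i;
	}
	return -1;
}

int main(void)
{
	struct av_entry tab[] = {
		{ EP_REGULAR,  0, 0x10 },
		{ EP_SCALABLE, 5, 0x20 },
	};

	printf("%d %d\n", translate_source(tab, 2, 0x20, 5),
	       translate_source(tab, 2, 0x20, 6));
	return 0;
}
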
diff --git a/deps/libfabric/prov/psm2/src/psmx2_cq.c b/deps/libfabric/prov/psm2/src/psmx2_cq.c
index bc3e25e1fe22633d1119ed7d0d14c3c2bf5f9099..b9a01c16ffb77161f3a8889b4528414784ffce1e 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_cq.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_cq.c
@@ -248,10 +248,12 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq,
 
 	if (is_recv) {
 		psm2_epaddr_t source = PSMX2_STATUS_PEER(status);
+		int source_sep_id = (flags & FI_REMOTE_CQ_DATA) ? 0 : data;
 
 		if (event == event_in) {
 			if (src_addr) {
-				src_addr[0] = psmx2_av_translate_source(av, source);
+				src_addr[0] = psmx2_av_translate_source(av, source,
+									source_sep_id);
 				if (src_addr[0] == FI_ADDR_NOTAVAIL) {
 					*event_saved = 0;
 					event = psmx2_cq_alloc_event(comp_cq);
@@ -264,16 +266,21 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq,
 					event->error = !!event->cqe.err.err;
 					if (av->addr_format == FI_ADDR_STR) {
 						event->cqe.err.err_data_size = PSMX2_ERR_DATA_SIZE;
-						psmx2_get_source_string_name(source, (void *)&comp_cq->error_data,
-										 &event->cqe.err.err_data_size);
+						psmx2_get_source_string_name(
+							source, source_sep_id,
+							(void *)&comp_cq->error_data,
+							&event->cqe.err.err_data_size);
 					} else {
-						psmx2_get_source_name(source, (void *)&comp_cq->error_data);
+						psmx2_get_source_name(
+							source, source_sep_id,
+							(void *)&comp_cq->error_data);
 						event->cqe.err.err_data_size = sizeof(struct psmx2_ep_name);
 					}
 				}
 			}
 		} else {
 			event->source_is_valid = 1;
+			event->source_sep_id = source_sep_id;
 			event->source = source;
 			event->source_av = av;
 		}
@@ -433,11 +440,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry
 		if (ep->recv_cq) {
 			op_context = fi_context;
 			buf = PSMX2_CTXT_USER(fi_context);
-			data = 0;
-			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) {
+			data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
+			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req))))
 				flags |= FI_REMOTE_CQ_DATA;
-				data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
-			}
 			err = psmx2_cq_rx_complete(
 					status_data->poll_cq, ep->recv_cq, ep->av,
 					req, op_context, buf, flags, data,
@@ -457,11 +462,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry
 		if (ep->recv_cq) {
 			op_context = fi_context;
 			buf = PSMX2_CTXT_USER(fi_context);
-			data = 0;
-			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) {
+			data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
+			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req))))
 				flags |= FI_REMOTE_CQ_DATA;
-				data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
-			}
 			err = psmx2_cq_rx_complete(
 					status_data->poll_cq, ep->recv_cq, ep->av,
 					req, op_context, buf, flags, data,
@@ -481,11 +484,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry
 		}
 		PSMX2_EP_PUT_OP_CONTEXT(ep, fi_context);
 		if (OFI_UNLIKELY(ep->recv_cq && PSMX2_STATUS_ERROR(req))) {
-			data = 0;
-			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) {
+			data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
+			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req))))
 				flags |= FI_REMOTE_CQ_DATA;
-				data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
-			}
 			err = psmx2_cq_rx_complete(
 					status_data->poll_cq, ep->recv_cq, ep->av,
 					req, NULL, NULL, flags, data,
@@ -505,9 +506,12 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry
 		}
 		PSMX2_EP_PUT_OP_CONTEXT(ep, fi_context);
 		if (OFI_UNLIKELY(ep->recv_cq && PSMX2_STATUS_ERROR(req))) {
+			data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
+			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req))))
+				flags |= FI_REMOTE_CQ_DATA;
 			err = psmx2_cq_rx_complete(
 					status_data->poll_cq, ep->recv_cq, ep->av,
-					req, NULL, NULL, flags, 0,
+					req, NULL, NULL, flags, data,
 					entry, status_data->src_addr, &event_saved);
 			if (OFI_UNLIKELY(err))
 				return err;
@@ -619,11 +623,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry
 		if (ep->recv_cq) {
 			op_context = fi_context;
 			buf = multi_recv_req->buf + multi_recv_req->offset;
-			data = 0;
-			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) {
+			data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
+			if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req))))
 				flags |= FI_REMOTE_CQ_DATA;
-				data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req));
-			}
 			if (multi_recv_req->offset + PSMX2_STATUS_RCVLEN(req) +
 				multi_recv_req->min_buf_size > multi_recv_req->len)
 				flags |= FI_MULTI_RECV;	/* buffer used up */
@@ -955,10 +957,12 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq,
 
 	if (is_recv) {
 		psm2_epaddr_t source = PSMX2_STATUS_PEER(status);
+		int source_sep_id = (flags & FI_REMOTE_CQ_DATA) ? 0 : data;
 
 		if (event == event_in) {
 			if (src_addr) {
-				src_addr[*read_count] = psmx2_av_translate_source(av, source);
+				src_addr[*read_count] =
+					psmx2_av_translate_source(av, source, source_sep_id);
 				if (src_addr[*read_count] == FI_ADDR_NOTAVAIL) {
 					event = psmx2_cq_alloc_event(comp_cq);
 					if (!event)
@@ -970,10 +974,14 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq,
 					event->error = !!event->cqe.err.err;
 					if (av->addr_format == FI_ADDR_STR) {
 						event->cqe.err.err_data_size = PSMX2_ERR_DATA_SIZE;
-						psmx2_get_source_string_name(source, (void *)&comp_cq->error_data,
-									     &event->cqe.err.err_data_size);
+						psmx2_get_source_string_name(
+							source, source_sep_id,
+							(void *)&comp_cq->error_data,
+							&event->cqe.err.err_data_size);
 					} else {
-						psmx2_get_source_name(source, (void *)&comp_cq->error_data);
+						psmx2_get_source_name(
+							source, source_sep_id,
+							(void *)&comp_cq->error_data);
 						event->cqe.err.err_data_size = sizeof(struct psmx2_ep_name);
 					}
 
@@ -982,6 +990,7 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq,
 			}
 		} else {
 			event->source_is_valid = 1;
+			event->source_sep_id = source_sep_id;
 			event->source = source;
 			event->source_av = av;
 		}
@@ -1136,12 +1145,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq,
 					op_context = fi_context;
 					buf = PSMX2_CTXT_USER(fi_context);
 					flags = psmx2_comp_flags[context_type];
-					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) {
+					data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
+					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status))))
 						flags |= FI_REMOTE_CQ_DATA;
-						data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
-					} else {
-						data = 0;
-					}
 					err = psmx2_cq_rx_complete(
 							cq, ep->recv_cq, ep->av,
 							status, op_context, buf, flags, data,
@@ -1162,12 +1168,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq,
 					op_context = fi_context;
 					buf = PSMX2_CTXT_USER(fi_context);
 					flags = psmx2_comp_flags[context_type];
-					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) {
+					data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
+					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status))))
 						flags |= FI_REMOTE_CQ_DATA;
-						data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
-					} else {
-						data = 0;
-					}
 					err = psmx2_cq_rx_complete(
 							cq, ep->recv_cq, ep->av,
 							status, op_context, buf, flags, data,
@@ -1191,12 +1194,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq,
 					op_context = NULL;
 					buf = NULL;
 					flags = psmx2_comp_flags[context_type];
-					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) {
+					data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
+					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status))))
 						flags |= FI_REMOTE_CQ_DATA;
-						data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
-					} else {
-						data = 0;
-					}
 					err = psmx2_cq_rx_complete(
 							cq, ep->recv_cq, ep->av,
 							status, op_context, buf, flags, data,
@@ -1220,9 +1220,12 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq,
 					op_context = NULL;
 					buf = NULL;
 					flags = psmx2_comp_flags[context_type];
+					data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
+					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status))))
+						flags |= FI_REMOTE_CQ_DATA;
 					err = psmx2_cq_rx_complete(
 							cq, ep->recv_cq, ep->av,
-							status, op_context, buf, flags, 0,
+							status, op_context, buf, flags, data,
 							event_in, count, &read_count,
 							&read_more, src_addr);
 					if (err)
@@ -1347,12 +1350,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq,
 					op_context = fi_context;
 					buf = multi_recv_req->buf + multi_recv_req->offset;
 					flags = psmx2_comp_flags[context_type];
-					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) {
+					data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
+					if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status))))
 						flags |= FI_REMOTE_CQ_DATA;
-						data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status));
-					} else {
-						data = 0;
-					}
 					if (multi_recv_req->offset + PSMX2_STATUS_RCVLEN(status) +
 					    multi_recv_req->min_buf_size > multi_recv_req->len)
 						flags |= FI_MULTI_RECV;	/* buffer used up */
@@ -1609,16 +1609,21 @@ STATIC ssize_t psmx2_cq_readfrom(struct fid_cq *cq, void *buf, size_t count,
 		if (event) {
 			if (!event->error) {
 				if (src_addr && event->source_is_valid) {
-					source = psmx2_av_translate_source(event->source_av,
-									   event->source);
+					source = psmx2_av_translate_source(
+							event->source_av, event->source,
+							event->source_sep_id);
 					if (source == FI_ADDR_NOTAVAIL) {
 						if (cq_priv->domain->addr_format == FI_ADDR_STR) {
 							event->cqe.err.err_data_size = PSMX2_ERR_DATA_SIZE;
-							psmx2_get_source_string_name(event->source,
-										     (void *)&cq_priv->error_data,
-										     &event->cqe.err.err_data_size);
+							psmx2_get_source_string_name(
+								event->source, event->source_sep_id,
+								(void *)&cq_priv->error_data,
+								&event->cqe.err.err_data_size);
 						} else {
-							psmx2_get_source_name(event->source, (void *)&cq_priv->error_data);
+							psmx2_get_source_name(
+								event->source,
+								event->source_sep_id,
+								(void *)&cq_priv->error_data);
 							event->cqe.err.err_data_size = sizeof(struct psmx2_ep_name);
 						}
 						event->cqe.err.err_data = &cq_priv->error_data;
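
Across the CQ paths above, the data slot of the PSM2 tag is now read unconditionally, and the IMM flag decides what it means: with FI_REMOTE_CQ_DATA set it is the sender's immediate data, otherwise it carries the sender's sep_id. A sketch of that dual use (the bit layout is invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define IMM_BIT (1u << 0)

struct wire_tag {
	uint32_t data;	/* immediate data or sender sep_id */
	uint32_t flags;
};

static void decode(const struct wire_tag *t, uint32_t *cq_data, int *sep_id)
{
	if (t->flags & IMM_BIT) {
		*cq_data = t->data;	/* sender used FI_REMOTE_CQ_DATA */
		*sep_id = 0;
	} else {
		*cq_data = 0;
		*sep_id = (int)t->data;	/* sender's scalable-endpoint id */
	}
}

int main(void)
{
	struct wire_tag imm = { 42, IMM_BIT }, plain = { 3, 0 };
	uint32_t data;
	int sep;

	decode(&imm, &data, &sep);
	printf("imm: data=%u sep=%d\n", data, sep);
	decode(&plain, &data, &sep);
	printf("plain: data=%u sep=%d\n", data, sep);
	return 0;
}
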
diff --git a/deps/libfabric/prov/psm2/src/psmx2_ep.c b/deps/libfabric/prov/psm2/src/psmx2_ep.c
index 65697965de8035d0bf2705982654197f46193a94..4b32b7422a7b492c8f7a0ae64623c359b011886e 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_ep.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_ep.c
@@ -1032,6 +1032,8 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info,
 				   ((uintptr_t)sep_priv & 0xFFFF);
 
 	sep_priv->id = ofi_atomic_inc32(&domain_priv->sep_cnt);
+	for (i = 0; i < ctxt_cnt; i++)
+		sep_priv->ctxts[i].ep->sep_id = sep_priv->id;
 
 	domain_priv->sep_lock_fn(&domain_priv->sep_lock, 1);
 	dlist_insert_before(&sep_priv->entry, &domain_priv->sep_list);
diff --git a/deps/libfabric/prov/psm2/src/psmx2_init.c b/deps/libfabric/prov/psm2/src/psmx2_init.c
index 64d26865a1a81d978405db10f05d1b1580c88bf7..72982d73064ff17728d25a1afd88ecf8ec5840a0 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_init.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_init.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2019 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -430,7 +430,7 @@ static void psmx2_update_hfi_nic_info(struct fi_info *info)
 	char buffer[80];
 	char *s;
 	ssize_t n;
-	int a, b, c, d;
+	unsigned int a, b, c, d;
 	int unit;
 
 	for ( ; info; info = info->next) {
@@ -477,10 +477,10 @@ static void psmx2_update_hfi_nic_info(struct fi_info *info)
 		}
 
 		info->nic->bus_attr->bus_type = FI_BUS_PCI;
-		info->nic->bus_attr->attr.pci.domain_id = a;
-		info->nic->bus_attr->attr.pci.bus_id = b;
-		info->nic->bus_attr->attr.pci.device_id = c;
-		info->nic->bus_attr->attr.pci.function_id = d;
+		info->nic->bus_attr->attr.pci.domain_id = (uint16_t) a;
+		info->nic->bus_attr->attr.pci.bus_id =  (uint8_t) b;
+		info->nic->bus_attr->attr.pci.device_id = (uint8_t) c;
+		info->nic->bus_attr->attr.pci.function_id = (uint8_t) d;
 	}
 }
 
@@ -559,7 +559,7 @@ static int psmx2_getinfo(uint32_t api_version, const char *node,
 
 	/* Check that the src address contains valid unit */
 	if (src_addr->unit != PSMX2_DEFAULT_UNIT) {
-		if (src_addr->unit < 0 || src_addr->unit > PSMX2_MAX_UNITS) {
+		if (src_addr->unit < 0 || src_addr->unit >= PSMX2_MAX_UNITS) {
 			FI_INFO(&psmx2_prov, FI_LOG_CORE,
 				"invalid source address: unit %d out of range\n", src_addr->unit);
 			goto err_out;
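
The psmx2_init.c changes fix two range problems: %x in sscanf() requires unsigned int targets, and the parsed values must then be truncated explicitly into the narrower PCI attribute fields; the unit check also becomes >= so an index equal to PSMX2_MAX_UNITS is rejected. A standalone sketch of the parsing half (the BDF format string is assumed, not taken from the provider):

#include <stdint.h>
#include <stdio.h>

struct pci_attr {
	uint16_t domain_id;
	uint8_t bus_id, device_id, function_id;
};

static int parse_bdf(const char *s, struct pci_attr *pci)
{
	unsigned int a, b, c, d;	/* %x requires unsigned targets */

	if (sscanf(s, "%x:%x:%x.%x", &a, &b, &c, &d) != 4)
		return -1;

	pci->domain_id = (uint16_t)a;
	pci->bus_id = (uint8_t)b;
	pci->device_id = (uint8_t)c;
	pci->function_id = (uint8_t)d;
	return 0;
}

int main(void)
{
	struct pci_attr pci;

	if (parse_bdf("0000:3b:00.1", &pci))
		return 1;
	printf("%04x:%02x:%02x.%x\n", (unsigned)pci.domain_id,
	       (unsigned)pci.bus_id, (unsigned)pci.device_id,
	       (unsigned)pci.function_id);
	return 0;
}
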
diff --git a/deps/libfabric/prov/psm2/src/psmx2_msg.c b/deps/libfabric/prov/psm2/src/psmx2_msg.c
index 50a03f498380f15a301953409bc5c27d17e05efb..2384ab23bd13bcb2d93115958a8622df9d060ff0 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_msg.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_msg.c
@@ -211,7 +211,10 @@ ssize_t psmx2_send_generic(struct fid_ep *ep, const void *buf, size_t len,
 	assert(av);
 	psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
 
-	PSMX2_SET_TAG(psm2_tag, 0, data, PSMX2_TYPE_MSG | PSMX2_IMM_BIT_SET(have_data));
+	if (have_data)
+		PSMX2_SET_TAG(psm2_tag, 0, data, PSMX2_TYPE_MSG | PSMX2_IMM_BIT);
+	else
+		PSMX2_SET_TAG(psm2_tag, 0, ep_priv->sep_id, PSMX2_TYPE_MSG);
 
 	if ((flags & PSMX2_NO_COMPLETION) ||
 	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
@@ -353,10 +356,12 @@ ssize_t psmx2_sendv_generic(struct fid_ep *ep, const struct iovec *iov,
 	assert(av);
 	psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
 
-	if (flags & FI_REMOTE_CQ_DATA)
+	if (flags & FI_REMOTE_CQ_DATA) {
 		msg_flags |= PSMX2_IMM_BIT;
-
-	PSMX2_SET_TAG(psm2_tag, 0ULL, data, msg_flags);
+		PSMX2_SET_TAG(psm2_tag, 0ULL, data, msg_flags);
+	} else {
+		PSMX2_SET_TAG(psm2_tag, 0ULL, ep_priv->sep_id, msg_flags);
+	}
 
 	if ((flags & PSMX2_NO_COMPLETION) ||
 	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
diff --git a/deps/libfabric/prov/psm2/src/psmx2_rma.c b/deps/libfabric/prov/psm2/src/psmx2_rma.c
index 84928fb4708df6cb1b0a0ea102da765cc657d779..db7873e0bae651326b97eec67fe37030300fee5d 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_rma.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_rma.c
@@ -577,6 +577,7 @@ static ssize_t psmx2_rma_self(int am_cmd,
 void psmx2_am_ack_rma(struct psmx2_am_request *req)
 {
 	psm2_amarg_t args[8];
+	int err;
 
 	if ((req->op & PSMX2_AM_OP_MASK) != PSMX2_AM_REQ_WRITE_LONG)
 		return;
@@ -585,9 +586,12 @@ void psmx2_am_ack_rma(struct psmx2_am_request *req)
 	args[0].u32w1 = req->error;
 	args[1].u64 = (uint64_t)(uintptr_t)req->write.context;
 
-	psm2_am_request_short(req->write.peer_addr,
-			      PSMX2_AM_RMA_HANDLER, args, 2, NULL, 0,
-			      PSM2_AM_FLAG_NOREPLY, NULL, NULL);
+	err = psm2_am_request_short(req->write.peer_addr,
+				    PSMX2_AM_RMA_HANDLER, args, 2, NULL, 0,
+				    PSM2_AM_FLAG_NOREPLY, NULL, NULL);
+	if (err)
+		FI_INFO(&psmx2_prov, FI_LOG_EP_DATA,
+			"failed to send am_ack: err %d.\n", err);
 }
 
 #if !HAVE_PSM2_MQ_FP_MSG
@@ -636,6 +640,8 @@ ssize_t psmx2_read_generic(struct fid_ep *ep, void *buf, size_t len,
 	psm2_epid_t psm2_epid;
 	psm2_mq_req_t psm2_req;
 	psm2_mq_tag_t psm2_tag, psm2_tagsel;
+	size_t req_refcnt = 0;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -684,17 +690,25 @@ ssize_t psmx2_read_generic(struct fid_ep *ep, void *buf, size_t len,
 	if (psmx2_env.tagged_rma && len > chunk_size) {
 		PSMX2_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX2_RMA_TYPE_READ);
 		PSMX2_SET_MASK(psm2_tagsel, PSMX2_MATCH_ALL, PSMX2_RMA_TYPE_MASK);
-		psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr,
-			       &psm2_tag, &psm2_tagsel, 0, buf, len,
-			       (void *)&req->fi_context, &psm2_req);
+		err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				     &psm2_tag, &psm2_tagsel, 0, buf, len,
+				     (void *)&req->fi_context, &psm2_req);
+		if (err) {
+			psmx2_am_request_free(ep_priv->tx, req);
+			return psmx2_errno(err);
+		}
 
 		PSMX2_AM_SET_OP(args[0].u32w0, PSMX2_AM_REQ_READ_LONG);
 		args[0].u32w1 = len;
 		args[1].u64 = (uint64_t)req;
 		args[2].u64 = addr;
 		args[3].u64 = key;
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
-				      args, 4, NULL, 0, 0, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, 4, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			/* req in use, don't free */
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 		return 0;
 	}
@@ -706,20 +720,31 @@ ssize_t psmx2_read_generic(struct fid_ep *ep, void *buf, size_t len,
 		args[0].u32w1 = chunk_size;
 		args[2].u64 = addr;
 		args[4].u64 = offset;
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
-				      args, 5, NULL, 0, 0, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, 5, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			if (!req_refcnt)
+				psmx2_am_request_free(ep_priv->tx, req);
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 		addr += chunk_size;
 		len -= chunk_size;
 		offset += chunk_size;
+		req_refcnt++;
 	}
 
 	PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM);
 	args[0].u32w1 = len;
 	args[2].u64 = addr;
 	args[4].u64 = offset;
-	psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
-			      args, 5, NULL, 0, 0, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+				    args, 5, NULL, 0, 0, NULL, NULL);
+	if (err) {
+		if (!req_refcnt)
+			psmx2_am_request_free(ep_priv->tx, req);
+		return psmx2_errno(err);
+	}
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -742,6 +767,8 @@ ssize_t psmx2_readv_generic(struct fid_ep *ep, const struct iovec *iov,
 	size_t total_len, long_len = 0, short_len;
 	void *long_buf = NULL;
 	int i;
+	size_t req_refcnt = 0;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -819,39 +846,68 @@ ssize_t psmx2_readv_generic(struct fid_ep *ep, const struct iovec *iov,
 		args[0].u32w1 = chunk_size;
 		args[2].u64 = addr;
 		args[4].u64 = offset;
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
-				      args, 5, NULL, 0, 0, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, 5, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx2_am_request_free(ep_priv->tx, req);
+			}
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 		addr += chunk_size;
 		short_len -= chunk_size;
 		offset += chunk_size;
+		req_refcnt++;
 	}
 
-	if (!long_len)
-		PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM);
-	args[0].u32w1 = short_len;
-	args[2].u64 = addr;
-	args[4].u64 = offset;
-	psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
-			      args, 5, NULL, 0, 0, NULL, NULL);
-	psmx2_am_poll(ep_priv->tx);
+	if (short_len) {
+		if (!long_len)
+			PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM);
+		args[0].u32w1 = short_len;
+		args[2].u64 = addr;
+		args[4].u64 = offset;
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, 5, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx2_am_request_free(ep_priv->tx, req);
+			}
+			return psmx2_errno(err);
+		}
+		psmx2_am_poll(ep_priv->tx);
+		req_refcnt++;
+	}
 
 	/* Use the long protocol for the last segment */
 	if (long_len) {
 		PSMX2_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX2_RMA_TYPE_READ);
 		PSMX2_SET_MASK(psm2_tagsel, PSMX2_MATCH_ALL, PSMX2_RMA_TYPE_MASK);
-		psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr,
-			       &psm2_tag, &psm2_tagsel, 0,
-			       long_buf, long_len,
-			       (void *)&req->fi_context, &psm2_req);
+		err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr,
+				     &psm2_tag, &psm2_tagsel, 0,
+				     long_buf, long_len,
+				     (void *)&req->fi_context, &psm2_req);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx2_am_request_free(ep_priv->tx, req);
+			}
+			return psmx2_errno(err);
+		}
 
 		PSMX2_AM_SET_OP(args[0].u32w0, PSMX2_AM_REQ_READ_LONG);
 		args[0].u32w1 = long_len;
 		args[1].u64 = (uint64_t)req;
 		args[2].u64 = addr + short_len;
 		args[3].u64 = key;
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
-				      args, 4, NULL, 0, 0, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, 4, NULL, 0, 0, NULL, NULL);
+		if (err) {
+			/* req in use, don't free */
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 	}
 
@@ -937,6 +993,8 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len,
 	psm2_mq_tag_t psm2_tag;
 	void *psm2_context;
 	int no_event;
+	size_t req_refcnt = 0;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -1022,13 +1080,22 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len,
 			psm2_context = (void *)&req->fi_context;
 		}
 
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args,
-				      nargs, NULL, 0, am_flags, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, nargs, NULL, 0, am_flags,
+					    NULL, NULL);
+		if (err) {
+			free(req->tmpbuf);
+			psmx2_am_request_free(ep_priv->tx, req);
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 
-		psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
-			       &psm2_tag, buf, len, psm2_context, &psm2_req);
-
+		err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
+				     &psm2_tag, buf, len, psm2_context, &psm2_req);
+		if (err) {
+			/* req in use, don't free */
+			return psmx2_errno(err);
+		}
 		return 0;
 	}
 
@@ -1039,13 +1106,21 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len,
 		args[1].u64 = (uint64_t)(uintptr_t)req;
 		args[2].u64 = addr;
 		args[3].u64 = key;
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args,
-				      nargs, (void *)buf, chunk_size, am_flags,
-				      NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, nargs, (void *)buf,
+					    chunk_size, am_flags, NULL, NULL);
+		if (err) {
+			if (!req_refcnt) {
+				free(req->tmpbuf);
+				psmx2_am_request_free(ep_priv->tx, req);
+			}
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 		buf = (const uint8_t *)buf + chunk_size;
 		addr += chunk_size;
 		len -= chunk_size;
+		req_refcnt++;
 	}
 
 	args[0].u32w1 = len;
@@ -1059,8 +1134,16 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len,
 	} else {
 		PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM);
 	}
-	psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, nargs,
-			      (void *)buf, len, am_flags, NULL, NULL);
+	err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+				    args, nargs, (void *)buf, len, am_flags,
+				    NULL, NULL);
+	if (err) {
+		if (!req_refcnt) {
+			free(req->tmpbuf);
+			psmx2_am_request_free(ep_priv->tx, req);
+		}
+		return psmx2_errno(err);
+	}
 	psmx2_am_poll(ep_priv->tx);
 	return 0;
 }
@@ -1086,6 +1169,8 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov,
 	size_t total_len, len, len_sent;
 	uint8_t *buf, *p;
 	int i;
+	size_t req_refcnt = 0;
+	int err;
 
 	ep_priv = container_of(ep, struct psmx2_fid_ep, ep);
 
@@ -1162,8 +1247,14 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov,
 		} else {
 			PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM);
 		}
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, nargs,
-				      (void *)buf, len, am_flags, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, nargs, (void *)buf, len,
+					    am_flags, NULL, NULL);
+		if (err) {
+			free(req->tmpbuf);
+			psmx2_am_request_free(ep_priv->tx, req);
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 		return 0;
 	}
@@ -1220,14 +1311,24 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov,
 				psm2_context = (void *)&req->fi_context;
 			}
 
-			psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args,
-					      nargs, NULL, 0, am_flags, NULL, NULL);
+			err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+						    args, nargs, NULL, 0, am_flags,
+						    NULL, NULL);
+			if (err) {
+				if (!req_refcnt)
+					psmx2_am_request_free(ep_priv->tx, req);
+				return psmx2_errno(err);
+			}
 			psmx2_am_poll(ep_priv->tx);
 
-			psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
-				       &psm2_tag, iov[i].iov_base, iov[i].iov_len,
-				       psm2_context, &psm2_req);
-
+			err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr,
+					     0, &psm2_tag, iov[i].iov_base,
+					     iov[i].iov_len, psm2_context,
+					     &psm2_req);
+			if (err) {
+				/* req in use, don't free */
+				return psmx2_errno(err);
+			}
 			return 0;
 		}
 
@@ -1241,14 +1342,21 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov,
 			args[1].u64 = (uint64_t)(uintptr_t)req;
 			args[2].u64 = addr;
 			args[3].u64 = key;
-			psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args,
-					      nargs, (void *)buf, chunk_size, am_flags,
-					      NULL, NULL);
+			err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+						    args, nargs, (void *)buf,
+						    chunk_size, am_flags,
+						    NULL, NULL);
+			if (err) {
+				if (!req_refcnt)
+					psmx2_am_request_free(ep_priv->tx, req);
+				return psmx2_errno(err);
+			}
 			psmx2_am_poll(ep_priv->tx);
 			buf += chunk_size;
 			addr += chunk_size;
 			len -= chunk_size;
 			len_sent += chunk_size;
+			req_refcnt++;
 		}
 
 		args[0].u32w1 = len;
@@ -1264,12 +1372,19 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov,
 				PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM);
 			}
 		}
-		psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, nargs,
-				      (void *)buf, len, am_flags, NULL, NULL);
+		err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER,
+					    args, nargs, (void *)buf, len,
+					    am_flags, NULL, NULL);
+		if (err) {
+			if (!req_refcnt)
+				psmx2_am_request_free(ep_priv->tx, req);
+			return psmx2_errno(err);
+		}
 		psmx2_am_poll(ep_priv->tx);
 
 		addr += len;
 		len_sent += len;
+		req_refcnt++;
 	}
 
 	return 0;
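The error paths added across psmx2_read_generic(), psmx2_readv_generic(), psmx2_write_generic(), and psmx2_writev_generic() all enforce one ownership rule: req may be freed only while nothing has been handed to PSM2 yet. After the first successful psm2_am_request_short() the remote side may still respond against the request, and after psm2_mq_irecv2() the MQ owns the attached context, hence the "req in use, don't free" comments. req_refcnt counts the chunks already in flight. A condensed sketch of the rule, with hypothetical helpers standing in for the PSM2 calls:

#include <stddef.h>

struct tx_state;
struct request;

/* Hypothetical stand-ins for psm2_am_request_short() and
 * psmx2_am_request_free(); declarations only, to keep the sketch short. */
int  send_one_chunk(struct tx_state *tx, struct request *req);
int  more_chunks(const struct tx_state *tx);
void request_free(struct tx_state *tx, struct request *req);

int send_chunks(struct tx_state *tx, struct request *req)
{
	size_t sent = 0;	/* chunks the peer may already reference */
	int err;

	while (more_chunks(tx)) {
		err = send_one_chunk(tx, req);
		if (err) {
			if (!sent)	/* nothing in flight: safe to free */
				request_free(tx, req);
			/* else req stays alive; the completion or remote
			 * ack path is still responsible for it */
			return err;
		}
		sent++;
	}
	return 0;
}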
diff --git a/deps/libfabric/prov/psm2/src/psmx2_tagged.c b/deps/libfabric/prov/psm2/src/psmx2_tagged.c
index 65fc795ec90f330a5c117dcf12a50de838b5fe58..bb99b6a3714b3353c0172e01fc76f7121390143e 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_tagged.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_tagged.c
@@ -535,8 +535,12 @@ ssize_t psmx2_tagged_send_generic(struct fid_ep *ep,
 	assert(av);
 	psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
 
-	PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data,
-		      PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT_SET(have_data));
+	if (have_data)
+		PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data,
+			      PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT);
+	else
+		PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id,
+			      PSMX2_TYPE_TAGGED);
 
 	if ((flags & PSMX2_NO_COMPLETION) ||
 	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
@@ -602,7 +606,7 @@ psmx2_tagged_send_specialized(struct fid_ep *ep, const void *buf,
 			      fi_addr_t dest_addr, uint64_t tag,
 			      void *context,
 			      int enable_completion, int av_map,
-			      int has_data, uint64_t data)
+			      int have_data, uint64_t data)
 {
 	struct psmx2_fid_ep *ep_priv;
 	psm2_epaddr_t psm2_epaddr;
@@ -622,10 +626,10 @@ psmx2_tagged_send_specialized(struct fid_ep *ep, const void *buf,
 		psm2_epaddr = psmx2_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE);
 	}
 
-	if (has_data)
+	if (have_data)
 		PSMX2_SET_TAG(psm2_tag, tag, data, PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT);
 	else
-		PSMX2_SET_TAG(psm2_tag, tag, 0, PSMX2_TYPE_TAGGED);
+		PSMX2_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX2_TYPE_TAGGED);
 
 	if (enable_completion) {
 		fi_context = context;
@@ -734,7 +738,7 @@ static inline ssize_t
 psmx2_tagged_inject_specialized(struct fid_ep *ep, const void *buf,
 				size_t len, fi_addr_t dest_addr,
 				uint64_t tag, int av_map,
-				int has_data, uint64_t data)
+				int have_data, uint64_t data)
 {
 	struct psmx2_fid_ep *ep_priv;
 	psm2_epaddr_t psm2_epaddr;
@@ -755,10 +759,10 @@ psmx2_tagged_inject_specialized(struct fid_ep *ep, const void *buf,
 		psm2_epaddr = psmx2_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE);
 	}
 
-	if (has_data)
+	if (have_data)
 		PSMX2_SET_TAG(psm2_tag, tag, data, PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT);
 	else
-		PSMX2_SET_TAG(psm2_tag, tag, 0, PSMX2_TYPE_TAGGED);
+		PSMX2_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX2_TYPE_TAGGED);
 
 	err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0,
 			    &psm2_tag, buf, len);
@@ -895,9 +899,10 @@ ssize_t psmx2_tagged_sendv_generic(struct fid_ep *ep,
 	assert(av);
 	psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type);
 
-	PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data,
-		      msg_flags | PSMX2_IMM_BIT_SET(have_data));
-
+	if (have_data)
+		PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data, msg_flags | PSMX2_IMM_BIT);
+	else
+		PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id, msg_flags);
 
 	if ((flags & PSMX2_NO_COMPLETION) ||
 	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
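psmx2_tagged_send_specialized() and psmx2_tagged_inject_specialized() (whose has_data parameter is renamed have_data above for consistency with the generic paths) are instantiated from thin wrappers that pass compile-time constants, so the compiler can fold the have_data branch out of each entry point. A sketch of that specialization pattern, using hypothetical names throughout:

#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>

#define IMM_BIT 0x80000000u	/* hypothetical immediate-data flag */

ssize_t post_send(const void *buf, size_t len, uint32_t tag_word); /* stub */

/* One inline body; have_data is a literal at every call site below,
 * so each wrapper compiles to a branch-free specialization. */
static inline ssize_t do_send(const void *buf, size_t len, int have_data,
			      uint64_t data, uint32_t sep_id)
{
	uint32_t tag_word = have_data ? ((uint32_t) data | IMM_BIT) : sep_id;

	return post_send(buf, len, tag_word);
}

ssize_t send_plain(const void *buf, size_t len, uint32_t sep_id)
{
	return do_send(buf, len, 0, 0, sep_id);
}

ssize_t send_with_data(const void *buf, size_t len, uint64_t data)
{
	return do_send(buf, len, 1, data, 0);
}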
diff --git a/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c b/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c
index a4233b8bdf4b8e9193ca9db1bd8a2b5b99473741..f90f50f6e738173f9e37e47e40a86d90e6e1e6b7 100644
--- a/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c
+++ b/deps/libfabric/prov/psm2/src/psmx2_trx_ctxt.c
@@ -128,6 +128,7 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt)
 	struct psmx2_epaddr_context *peer;
 	struct dlist_entry peer_list;
 	psm2_amarg_t arg;
+	int err;
 
 	arg.u32w0 = PSMX2_AM_REQ_TRX_CTXT_DISCONNECT;
 
@@ -144,8 +145,14 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt)
 		peer = container_of(item, struct psmx2_epaddr_context, entry);
 		if (psmx2_env.disconnect) {
 			FI_INFO(&psmx2_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr);
-			psm2_am_request_short(peer->epaddr, PSMX2_AM_TRX_CTXT_HANDLER,
-					      &arg, 1, NULL, 0, 0, NULL, NULL);
+			err = psm2_am_request_short(peer->epaddr,
+						    PSMX2_AM_TRX_CTXT_HANDLER,
+						    &arg, 1, NULL, 0, 0, NULL,
+						    NULL);
+			if (err)
+				FI_INFO(&psmx2_prov, FI_LOG_CORE,
+					"failed to send disconnect, err %d\n",
+					err);
 		}
 		psm2_epaddr_setctxt(peer->epaddr, NULL);
 		free(peer);
diff --git a/deps/libfabric/prov/rstream/src/rstream.h b/deps/libfabric/prov/rstream/src/rstream.h
index ddbc1fd5b67b3b3ff786b46da447140dd3d647bc..70583069e669d1c846d259f91e1d3536f3da5761 100644
--- a/deps/libfabric/prov/rstream/src/rstream.h
+++ b/deps/libfabric/prov/rstream/src/rstream.h
@@ -207,9 +207,9 @@ extern ssize_t rstream_post_cq_data_recv(struct rstream_ep *ep,
 	const struct fi_cq_data_entry *cq_entry);
 
 extern int rstream_info_to_rstream(uint32_t version, const struct fi_info *core_info,
-	struct fi_info *info);
+	const struct fi_info *base_info, struct fi_info *info);
 extern int rstream_info_to_core(uint32_t version, const struct fi_info *rstream_info,
-	struct fi_info *core_info);
+	const struct fi_info *base_info, struct fi_info *core_info);
 extern void rstream_set_info(struct fi_info *info);
 extern struct fi_ops_cm rstream_ops_cm;
 extern struct fi_ops_cm rstream_ops_pep_cm;
@@ -227,6 +227,6 @@ int rstream_ep_open(struct fid_domain *domain, struct fi_info *info,
 int rstream_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
 	struct fid_eq **eq, void *context);
 int rstream_info_to_core(uint32_t version, const struct fi_info *rstream_info,
-	struct fi_info *core_info);
+	const struct fi_info *base_info, struct fi_info *core_info);
 
 #endif /* _RSTREAM_H_ */
diff --git a/deps/libfabric/prov/rstream/src/rstream_domain.c b/deps/libfabric/prov/rstream/src/rstream_domain.c
index 3f109681d04e02db9264474e34a170798009266d..3b59107d771511236ef13e7c64962649aa3173f1 100644
--- a/deps/libfabric/prov/rstream/src/rstream_domain.c
+++ b/deps/libfabric/prov/rstream/src/rstream_domain.c
@@ -98,7 +98,7 @@ int rstream_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		util_fabric.fabric_fid);
 
 	ret = ofi_get_core_info(FI_VERSION(1, 8), NULL, NULL, 0,
-		&rstream_util_prov, info, rstream_info_to_core, &cinfo);
+		&rstream_util_prov, info, NULL, rstream_info_to_core, &cinfo);
 	if (ret)
 		goto err1;
 
diff --git a/deps/libfabric/prov/rstream/src/rstream_ep.c b/deps/libfabric/prov/rstream/src/rstream_ep.c
index 361f2a866266b99a0281cca58721d7e9e3fc7909..27f5e21ebf5fa92fd87ee7c8603847a2a0e1cc8c 100644
--- a/deps/libfabric/prov/rstream/src/rstream_ep.c
+++ b/deps/libfabric/prov/rstream/src/rstream_ep.c
@@ -250,7 +250,7 @@ int rstream_ep_open(struct fid_domain *domain, struct fi_info *info,
 	if (ret)
 		goto err1;
 
-	rstream_info_to_core(FI_VERSION(1, 8), NULL, info);
+	rstream_info_to_core(FI_VERSION(1, 8), NULL, NULL, info);
 
 	if (info->handle && info->handle->fclass == FI_CLASS_PEP) {
 		rstream_pep = container_of(info->handle,
@@ -378,7 +378,7 @@ int rstream_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 	if (!rstream_pep)
 		return -FI_ENOMEM;
 
-	rstream_info_to_core(FI_VERSION(1, 8), NULL, info);
+	rstream_info_to_core(FI_VERSION(1, 8), NULL, NULL, info);
 
 	ret = fi_passive_ep(rstream_fabric->msg_fabric, info,
 		&rstream_pep->pep_fd, NULL);
diff --git a/deps/libfabric/prov/rstream/src/rstream_init.c b/deps/libfabric/prov/rstream/src/rstream_init.c
index 3430df6e6ac4e710f9f8d8497a0d1e9012593e3b..15d09302c14577f9e4cd95135522101055b2dc3d 100644
--- a/deps/libfabric/prov/rstream/src/rstream_init.c
+++ b/deps/libfabric/prov/rstream/src/rstream_init.c
@@ -51,7 +51,7 @@ static void rstream_default_settings(struct fi_info *core_info)
 }
 
 int rstream_info_to_core(uint32_t version, const struct fi_info *irstream_info,
-	struct fi_info *core_info)
+	const struct fi_info *base_info, struct fi_info *core_info)
 {
 	core_info->ep_attr->type = FI_EP_MSG;
 	core_info->ep_attr->protocol = FI_PROTO_UNSPEC;
@@ -90,7 +90,7 @@ static void update_rstream_info(const struct fi_info *core_info)
 }
 
 int rstream_info_to_rstream(uint32_t version, const struct fi_info *core_info,
-	struct fi_info *info)
+	const struct fi_info *base_info, struct fi_info *info)
 {
 	info->caps = RSTREAM_CAPS;
 	info->mode = 0;
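The rstream edits are mechanical fallout from a utility-layer change: ofi_get_core_info() and the info-translation callbacks it invokes now take an extra const struct fi_info *base_info parameter, which rstream ignores (all call sites pass NULL). A hedged sketch of the new callback shape; the typedef name here is hypothetical:

#include <rdma/fabric.h>

typedef int (*info_translate_fn)(uint32_t version,
				 const struct fi_info *src_info,
				 const struct fi_info *base_info,
				 struct fi_info *dest_info);

/* Sketch: a layered provider fills in core attributes and may consult
 * base_info when one is supplied; rstream simply leaves it unused. */
static int example_info_to_core(uint32_t version,
				const struct fi_info *src_info,
				const struct fi_info *base_info,
				struct fi_info *core_info)
{
	(void) version; (void) src_info; (void) base_info;
	core_info->ep_attr->type = FI_EP_MSG;	/* as rstream_info_to_core does */
	return 0;
}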
diff --git a/deps/libfabric/prov/rxd/src/rxd.h b/deps/libfabric/prov/rxd/src/rxd.h
index 699b3d4bacf051e2e1da66ce7fe421c5a152f5f1..d7ad6674266566cefdb4ba9b7dec867f91d74090 100644
--- a/deps/libfabric/prov/rxd/src/rxd.h
+++ b/deps/libfabric/prov/rxd/src/rxd.h
@@ -54,6 +54,7 @@
 #include <ofi_util.h>
 #include <ofi_tree.h>
 #include <ofi_atomic.h>
+#include <ofi_indexer.h>
 #include "rxd_proto.h"
 
 #ifndef _RXD_H_
@@ -71,6 +72,7 @@
 #define RXD_RX_POOL_CHUNK_CNT	1024
 #define RXD_MAX_PENDING		128
 #define RXD_MAX_PKT_RETRY	50
+#define RXD_ADDR_INVALID	0
 
 #define RXD_PKT_IN_USE		(1 << 0)
 #define RXD_PKT_ACKED		(1 << 1)
@@ -83,6 +85,8 @@
 #define RXD_INLINE		(1 << 5)
 #define RXD_MULTI_RECV		(1 << 6)
 
+#define RXD_IDX_OFFSET(x)	((x) + 1)
+
 struct rxd_env {
 	int spin_count;
 	int retry;
@@ -151,14 +155,12 @@ struct rxd_av {
 	struct util_av util_av;
 	struct fid_av *dg_av;
 	struct ofi_rbmap rbmap;
-	int fi_addr_idx;
-	int rxd_addr_idx;
 
 	int dg_av_used;
 	size_t dg_addrlen;
-
-	fi_addr_t *fi_addr_table;
-	struct rxd_addr *rxd_addr_table;
+	struct indexer fi_addr_idx;
+	struct indexer rxdaddr_dg_idx;
+	struct index_map rxdaddr_fi_idm;
 };
 
 struct rxd_cq;
@@ -216,9 +218,14 @@ struct rxd_ep {
 	struct dlist_entry rts_sent_list;
 	struct dlist_entry ctrl_pkts;
 
-	struct rxd_peer peers[];
+	struct index_map peers_idm;
 };
+/* ensure ep lock is held before this function is called */
+static inline struct rxd_peer *rxd_peer(struct rxd_ep *ep, fi_addr_t rxd_addr)
+{
+	return ofi_idm_lookup(&ep->peers_idm, rxd_addr);
+}
 
 static inline struct rxd_domain *rxd_ep_domain(struct rxd_ep *ep)
 {
 	return container_of(ep->util_ep.domain, struct rxd_domain, util_domain);
@@ -388,7 +395,7 @@ struct rxd_match_attr {
 
 static inline int rxd_match_addr(fi_addr_t addr, fi_addr_t match_addr)
 {
-	return (addr == FI_ADDR_UNSPEC || addr == match_addr);
+	return (addr == RXD_ADDR_INVALID || addr == match_addr);
 }
 
 static inline int rxd_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_tag)
@@ -397,9 +404,9 @@ static inline int rxd_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_ta
 }
 
 int rxd_info_to_core(uint32_t version, const struct fi_info *rxd_info,
-		     struct fi_info *core_info);
+		     const struct fi_info *base_info, struct fi_info *core_info);
 int rxd_info_to_rxd(uint32_t version, const struct fi_info *core_info,
-		    struct fi_info *info);
+		    const struct fi_info *base_info, struct fi_info *info);
 
 int rxd_fabric(struct fi_fabric_attr *attr,
 	       struct fid_fabric **fabric, void *context);
@@ -449,7 +456,7 @@ size_t rxd_init_msg(void **ptr, const struct iovec *iov, size_t iov_count,
 		    size_t total_len, size_t avail_len);
 static inline void rxd_check_init_cq_data(void **ptr, struct rxd_x_entry *tx_entry,
 			      		  size_t *max_inline)
-{	
+{
 	if (tx_entry->flags & RXD_REMOTE_CQ_DATA) {
 		rxd_init_data_hdr(ptr, tx_entry);
 		*max_inline -= sizeof(tx_entry->cq_entry.data);
@@ -511,4 +518,7 @@ void rxd_cleanup_unexp_msg(struct rxd_unexp_msg *unexp_msg);
 void rxd_cq_report_error(struct rxd_cq *cq, struct fi_cq_err_entry *err_entry);
 void rxd_cq_report_tx_comp(struct rxd_cq *cq, struct rxd_x_entry *tx_entry);
 
+int rxd_create_peer(struct rxd_ep *ep, uint64_t rxd_addr);
+
 #endif
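The header changes above replace the fixed peers[] array and address tables with indexer-based structures. Two conventions carry the design: slot 0 is never handed out, so a 0/NULL lookup result means "no entry" (which is why RXD_ADDR_INVALID can be 0), and application-facing fi_addr values are biased by one via RXD_IDX_OFFSET() on insertion and decremented on the way out (see fi_addr[i] = util_addr - 1 in rxd_av_insert below). A small sketch of the lookup side, assuming the ofi_idx_lookup() semantics used throughout this patch:

#include <stdint.h>
#include <rdma/fabric.h>
#include <ofi_indexer.h>

/* Sketch: translate an application fi_addr to the internal rxd address.
 * An empty slot comes back as NULL, i.e. 0 == RXD_ADDR_INVALID. */
static fi_addr_t lookup_rxd_addr(struct indexer *fi_addr_idx, fi_addr_t fi_addr)
{
	void *val = ofi_idx_lookup(fi_addr_idx, RXD_IDX_OFFSET(fi_addr));

	return val ? (fi_addr_t) (intptr_t) val : FI_ADDR_UNSPEC;
}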
diff --git a/deps/libfabric/prov/rxd/src/rxd_atomic.c b/deps/libfabric/prov/rxd/src/rxd_atomic.c
index b1e9d2aeb943cb51846b21e35359b596509be564..c6bc2aae0cf69620cb4dafff8f3acb7d559760a4 100644
--- a/deps/libfabric/prov/rxd/src/rxd_atomic.c
+++ b/deps/libfabric/prov/rxd/src/rxd_atomic.c
@@ -133,8 +133,11 @@ static ssize_t rxd_generic_atomic(struct rxd_ep *rxd_ep,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
-
-	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
+
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+					     RXD_IDX_OFFSET(addr));
+	if (!rxd_addr)
+		goto out;
 	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
 	if (ret)
 		goto out;
@@ -145,7 +148,7 @@ static ssize_t rxd_generic_atomic(struct rxd_ep *rxd_ep,
 	if (!tx_entry)
 		goto out;
 
-	if (rxd_ep->peers[rxd_addr].peer_addr != FI_ADDR_UNSPEC)
+	if (rxd_peer(rxd_ep, rxd_addr)->peer_addr != RXD_ADDR_INVALID)
 		(void) rxd_start_xfer(rxd_ep, tx_entry);
 
 out:
@@ -234,8 +237,11 @@ static ssize_t rxd_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
+
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+					     RXD_IDX_OFFSET(dest_addr));
+	if (!rxd_addr)
+		goto out;
 
-	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[dest_addr];
 	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
 	if (ret)
 		goto out;
@@ -246,7 +252,7 @@ static ssize_t rxd_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 	if (!tx_entry)
 		goto out;
 
-	if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC)
+	if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID)
 		goto out;
 
 	(void) rxd_start_xfer(rxd_ep, tx_entry);
diff --git a/deps/libfabric/prov/rxd/src/rxd_attr.c b/deps/libfabric/prov/rxd/src/rxd_attr.c
index 686a2225b5a15d9a87b5937ebccdf78c49ebde99..26b45798c47f7dc190c2bbf766759b2055c0dd26 100644
--- a/deps/libfabric/prov/rxd/src/rxd_attr.c
+++ b/deps/libfabric/prov/rxd/src/rxd_attr.c
@@ -40,11 +40,18 @@
 #define RXD_RX_OP_FLAGS (FI_MULTI_RECV | FI_COMPLETION)
 #define RXD_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM)
 
+#define RXD_MSG_ORDER (FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW |	\
+		       FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW |	\
+		       FI_ORDER_RAR | FI_ORDER_RAS | FI_ORDER_RAW |	\
+		       FI_ORDER_RMA_RAR | FI_ORDER_RMA_RAW |		\
+		       FI_ORDER_RMA_WAW | FI_ORDER_SAS | FI_ORDER_SAW |	\
+		       FI_ORDER_WAS | FI_ORDER_WAW)
+
 struct fi_tx_attr rxd_tx_attr = {
 	.caps = RXD_TX_CAPS,
 	.op_flags = RXD_TX_OP_FLAGS,
 	.comp_order = FI_ORDER_NONE,
-	.msg_order = FI_ORDER_SAS,
+	.msg_order = RXD_MSG_ORDER,
 	.inject_size = RXD_MAX_MTU_SIZE - sizeof(struct rxd_base_hdr),
 	.size = (1ULL << RXD_MAX_TX_BITS),
 	.iov_limit = RXD_IOV_LIMIT,
@@ -55,7 +62,7 @@ struct fi_rx_attr rxd_rx_attr = {
 	.caps = RXD_RX_CAPS,
 	.op_flags = RXD_RX_OP_FLAGS,
 	.comp_order = FI_ORDER_NONE,
-	.msg_order = FI_ORDER_SAS,
+	.msg_order = RXD_MSG_ORDER,
 	.total_buffered_recv = 0,
 	.size = (1ULL << RXD_MAX_RX_BITS),
 	.iov_limit = RXD_IOV_LIMIT
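RXD_MSG_ORDER now advertises the full set of orderings the go-back-N reliability protocol already provides, instead of FI_ORDER_SAS alone. An application that needs, say, write-after-write ordering can express that in hints and still match rxd; a usage sketch (error handling elided):

#include <rdma/fabric.h>

/* Sketch: hints requesting WAW in addition to SAS ordering, which
 * msg_order = RXD_MSG_ORDER can now satisfy. */
static struct fi_info *ordered_hints(void)
{
	struct fi_info *hints = fi_allocinfo();

	if (!hints)
		return NULL;
	hints->tx_attr->msg_order = FI_ORDER_WAW | FI_ORDER_SAS;
	hints->rx_attr->msg_order = FI_ORDER_WAW | FI_ORDER_SAS;
	return hints;
}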
diff --git a/deps/libfabric/prov/rxd/src/rxd_av.c b/deps/libfabric/prov/rxd/src/rxd_av.c
index 1d9231e0099eb8db9d764957e045881b38743a40..e4c92d03bd67464ee1adcb88db3f010883ad6ea6 100644
--- a/deps/libfabric/prov/rxd/src/rxd_av.c
+++ b/deps/libfabric/prov/rxd/src/rxd_av.c
@@ -40,11 +40,14 @@ static int rxd_tree_compare(struct ofi_rbmap *map, void *key, void *data)
 	uint8_t addr[RXD_NAME_LENGTH];
 	size_t len = RXD_NAME_LENGTH;
 	int ret;
+	fi_addr_t dg_addr;
 
 	memset(addr, 0, len);
 	av = container_of(map, struct rxd_av, rbmap);
-	ret = fi_av_lookup(av->dg_av, av->rxd_addr_table[(fi_addr_t) data].dg_addr,
-			   addr, &len);
+	dg_addr = (intptr_t) ofi_idx_lookup(&av->rxdaddr_dg_idx,
+					    (fi_addr_t) data);
+
+	ret = fi_av_lookup(av->dg_av, dg_addr, addr, &len);
 	if (ret)
 		return -1;
 
@@ -105,43 +108,53 @@ close:
 
 static fi_addr_t rxd_av_dg_addr(struct rxd_av *av, fi_addr_t fi_addr)
 {
-	fi_addr_t rxd_addr = av->fi_addr_table[fi_addr];
-
-	return rxd_addr == FI_ADDR_UNSPEC ? rxd_addr :
-		av->rxd_addr_table[rxd_addr].dg_addr;
+	fi_addr_t dg_addr;
+	fi_addr_t rxd_addr = (intptr_t) ofi_idx_lookup(&av->fi_addr_idx,
+						       RXD_IDX_OFFSET(fi_addr));
+
+	if (!rxd_addr)
+		goto err;
+	dg_addr = (intptr_t) ofi_idx_lookup(&av->rxdaddr_dg_idx, rxd_addr);
+	if (!dg_addr)
+		goto err;
+
+	return dg_addr;
+err:
+	return FI_ADDR_UNSPEC;
 }
 
-static fi_addr_t rxd_set_rxd_addr(struct rxd_av *av, fi_addr_t dg_addr)
+static int rxd_set_rxd_addr(struct rxd_av *av, fi_addr_t dg_addr, fi_addr_t *addr)
 {
-	int tries = 0;
-
-	while (av->rxd_addr_table[av->rxd_addr_idx].dg_addr != FI_ADDR_UNSPEC &&
-	       tries < av->util_av.count) {
-		if (++av->rxd_addr_idx == av->util_av.count)
-			av->rxd_addr_idx = 0;
-		tries++;
-	}
-	assert(av->rxd_addr_idx < av->util_av.count && tries < av->util_av.count);
-	av->rxd_addr_table[av->rxd_addr_idx].dg_addr = dg_addr;
-
-	return av->rxd_addr_idx;
+	int rxdaddr;
+
+	rxdaddr = ofi_idx_insert(&(av->rxdaddr_dg_idx), (void *)(uintptr_t) dg_addr);
+	if (rxdaddr < 0)
+		return -FI_ENOMEM;
+
+	*addr = rxdaddr;
+	return 0;
 }
 
 static fi_addr_t rxd_set_fi_addr(struct rxd_av *av, fi_addr_t rxd_addr)
 {
-	int tries = 0;
-
-	while (av->fi_addr_table[av->fi_addr_idx] != FI_ADDR_UNSPEC &&
-	       tries < av->util_av.count) {
-		if (++av->fi_addr_idx == av->util_av.count)
-			av->fi_addr_idx = 0;
-		tries++;
-	}
-	assert(av->fi_addr_idx < av->util_av.count && tries < av->util_av.count);
-	av->fi_addr_table[av->fi_addr_idx] = rxd_addr;
-	av->rxd_addr_table[rxd_addr].fi_addr = av->fi_addr_idx;
-
-	return av->fi_addr_idx;
+	int fi_addr;
+	fi_addr_t dg_addr;
+
+	fi_addr = ofi_idx_insert(&(av->fi_addr_idx), (void *)(uintptr_t) rxd_addr);
+	if (fi_addr < 0)
+		goto nomem1;
+
+	if (ofi_idm_set(&(av->rxdaddr_fi_idm), rxd_addr,
+			(void *)(uintptr_t) fi_addr) < 0)
+		goto nomem2;
+
+	return fi_addr;
+
+nomem2:
+	ofi_idx_remove_ordered(&(av->fi_addr_idx), fi_addr);
+nomem1:
+	dg_addr = (intptr_t) ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx),
+						    rxd_addr);
+	fi_av_remove(av->dg_av, &dg_addr, 1, 0);
+
+	return -FI_ENOMEM;
 }
 
 int rxd_av_insert_dg_addr(struct rxd_av *av, const void *addr,
@@ -156,16 +169,24 @@ int rxd_av_insert_dg_addr(struct rxd_av *av, const void *addr,
 	if (ret != 1)
 		return -FI_EINVAL;
 
-	*rxd_addr = rxd_set_rxd_addr(av, dg_addr);
+	ret = rxd_set_rxd_addr(av, dg_addr, rxd_addr);
+	if (ret < 0)
+		goto nomem;
 
-	ret = ofi_rbmap_insert(&av->rbmap, (void *) addr, (void *) (*rxd_addr),
+	ret = ofi_rbmap_insert(&av->rbmap, (void *)addr, (void *)(*rxd_addr),
 			       NULL);
 	if (ret) {
 		assert(ret != -FI_EALREADY);
-		fi_av_remove(av->dg_av, &dg_addr, 1, flags);
+		ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), *rxd_addr);
+		goto nomem;
 	}
 
 	return ret;
+nomem:
+	fi_av_remove(av->dg_av, &dg_addr, 1, flags);
+	return ret;
 }
 
 static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
@@ -173,7 +194,8 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 {
 	struct rxd_av *av;
 	int i = 0, ret = 0, success_cnt = 0;
-	fi_addr_t rxd_addr, util_addr;
+	fi_addr_t rxd_addr;
+	int util_addr;
 	struct ofi_rbnode *node;
 
 	av = container_of(av_fid, struct rxd_av, util_av.av_fid);
@@ -195,11 +217,18 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 				break;
 		}
 
-		util_addr = av->rxd_addr_table[rxd_addr].fi_addr == FI_ADDR_UNSPEC ?
-			    rxd_set_fi_addr(av, rxd_addr) :
-			    av->rxd_addr_table[rxd_addr].fi_addr;
+		util_addr = (intptr_t) ofi_idm_lookup(&av->rxdaddr_fi_idm,
+						      rxd_addr);
+		if (!util_addr) {
+			util_addr = rxd_set_fi_addr(av, rxd_addr);
+			if (util_addr < 0) {
+				ret = util_addr;
+				break;
+			}
+		}
+
 		if (fi_addr)
-			fi_addr[i] = util_addr;
+			fi_addr[i] = util_addr - 1;
 
 		success_cnt++;
 	}
@@ -253,17 +282,23 @@ static int rxd_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count
 	int ret = 0;
 	size_t i, addrlen;
 	fi_addr_t rxd_addr;
+	fi_addr_t dg_addr;
 	struct rxd_av *av;
 	uint8_t addr[RXD_NAME_LENGTH];
 
 	av = container_of(av_fid, struct rxd_av, util_av.av_fid);
 	fastlock_acquire(&av->util_av.lock);
 	for (i = 0; i < count; i++) {
-		rxd_addr = av->fi_addr_table[fi_addr[i]];
-
 		addrlen = RXD_NAME_LENGTH;
-		ret = fi_av_lookup(av->dg_av, av->rxd_addr_table[rxd_addr].dg_addr,
-				   addr, &addrlen);
+		rxd_addr = (intptr_t) ofi_idx_lookup(&av->fi_addr_idx,
+						     RXD_IDX_OFFSET(fi_addr[i]));
+		if (!rxd_addr)
+			goto err;
+
+		dg_addr = (intptr_t) ofi_idx_lookup(&av->rxdaddr_dg_idx, rxd_addr);
+
+		ret = fi_av_lookup(av->dg_av, dg_addr, addr, &addrlen);
 		if (ret)
 			goto err;
 		
@@ -271,14 +306,15 @@ static int rxd_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count
 		if (ret)
 			goto err;
 
-		ret = fi_av_remove(av->dg_av, &av->rxd_addr_table[rxd_addr].dg_addr,
-				   1, flags);
+		ret = fi_av_remove(av->dg_av, &dg_addr, 1, flags);
 		if (ret)
 			goto err;
-
-		av->fi_addr_table[fi_addr[i]] = FI_ADDR_UNSPEC;
-		av->rxd_addr_table[rxd_addr].fi_addr = FI_ADDR_UNSPEC;
-		av->rxd_addr_table[rxd_addr].dg_addr = FI_ADDR_UNSPEC;
+
+		ofi_idx_remove_ordered(&(av->fi_addr_idx),
+				       RXD_IDX_OFFSET(fi_addr[i]));
+		ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), rxd_addr);
+		ofi_idm_clear(&(av->rxdaddr_fi_idm), rxd_addr);
 		av->dg_av_used--;
 	}
 
@@ -327,6 +363,7 @@ static int rxd_av_close(struct fid *fid)
 	struct rxd_av *av;
 	int ret;
 
 	av = container_of(fid, struct rxd_av, util_av.av_fid);
 	ret = fi_close(&av->dg_av->fid);
 	if (ret)
@@ -337,8 +374,10 @@ static int rxd_av_close(struct fid *fid)
 	if (ret)
 		return ret;
 
-	free(av->fi_addr_table);
-	free(av->rxd_addr_table);
+	ofi_idx_reset(&(av->fi_addr_idx));
+	ofi_idx_reset(&(av->rxdaddr_dg_idx));
+	ofi_idm_reset(&(av->rxdaddr_fi_idm));
+
 	free(av);
 	return 0;
 }
@@ -359,7 +398,7 @@ static struct fi_ops rxd_av_fi_ops = {
 int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 		   struct fid_av **av_fid, void *context)
 {
-	int ret, i;
+	int ret;
 	struct rxd_av *av;
 	struct rxd_domain *domain;
 	struct util_av_attr util_attr;
@@ -378,32 +417,23 @@ int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 	av = calloc(1, sizeof(*av));
 	if (!av)
 		return -FI_ENOMEM;
-	av->fi_addr_table = calloc(1, attr->count * sizeof(fi_addr_t));
-	av->rxd_addr_table = calloc(1, rxd_env.max_peers * sizeof(struct rxd_addr));
-	if (!av->fi_addr_table || !av->rxd_addr_table) {
-		ret = -FI_ENOMEM;
-		goto err1;
-	}
-
+	memset(&(av->fi_addr_idx), 0, sizeof(av->fi_addr_idx));
+	memset(&(av->rxdaddr_dg_idx), 0, sizeof(av->rxdaddr_dg_idx));
+	memset(&(av->rxdaddr_fi_idm), 0, sizeof(av->rxdaddr_fi_idm));
 
 	util_attr.addrlen = sizeof(fi_addr_t);
+	util_attr.context_len = 0;
 	util_attr.flags = 0;
 	attr->type = domain->util_domain.av_type != FI_AV_UNSPEC ?
 		     domain->util_domain.av_type : FI_AV_TABLE;
 
 	ret = ofi_av_init(&domain->util_domain, attr, &util_attr,
-			 &av->util_av, context);
+			  &av->util_av, context);
 	if (ret)
 		goto err1;
 
 	ofi_rbmap_init(&av->rbmap, rxd_tree_compare);
-	for (i = 0; i < attr->count; av->fi_addr_table[i++] = FI_ADDR_UNSPEC)
-		;
-	for (i = 0; i < rxd_env.max_peers; i++) {
-		av->rxd_addr_table[i].fi_addr = FI_ADDR_UNSPEC;
-		av->rxd_addr_table[i].dg_addr = FI_ADDR_UNSPEC;
-	}
-
+
 	av_attr = *attr;
 	av_attr.count = 0;
 	av_attr.flags = 0;
@@ -418,9 +448,7 @@ int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 
 err2:
 	ofi_av_close(&av->util_av);
-err1:
-	free(av->fi_addr_table);
-	free(av->rxd_addr_table);
+err1:
 	free(av);
 	return ret;
 }
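rxd_set_fi_addr() and rxd_av_insert_dg_addr() above acquire up to three resources per address (a datagram AV entry, an rxd-address slot, and a fi_addr slot plus its reverse mapping) and release them in reverse order when a later step fails. A condensed sketch of that unwind, assuming the struct rxd_av fields from rxd.h and that dg_addr was already inserted into av->dg_av, as in rxd_av_insert_dg_addr():

#include "rxd.h"

static int example_insert(struct rxd_av *av, fi_addr_t dg_addr)
{
	int rxd_slot, fi_slot;

	rxd_slot = ofi_idx_insert(&av->rxdaddr_dg_idx,
				  (void *)(uintptr_t) dg_addr);
	if (rxd_slot < 0)
		goto err_dg;

	fi_slot = ofi_idx_insert(&av->fi_addr_idx,
				 (void *)(uintptr_t) rxd_slot);
	if (fi_slot < 0)
		goto err_rxd;

	if (ofi_idm_set(&av->rxdaddr_fi_idm, rxd_slot,
			(void *)(uintptr_t) fi_slot) < 0)
		goto err_fi;

	return 0;

err_fi:
	ofi_idx_remove_ordered(&av->fi_addr_idx, fi_slot);
err_rxd:
	ofi_idx_remove_ordered(&av->rxdaddr_dg_idx, rxd_slot);
err_dg:
	fi_av_remove(av->dg_av, &dg_addr, 1, 0);	/* undo the dg insert */
	return -FI_ENOMEM;
}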
diff --git a/deps/libfabric/prov/rxd/src/rxd_cq.c b/deps/libfabric/prov/rxd/src/rxd_cq.c
index 0384ead088e580b19cd5e707a7dcb8cafb75bf4d..244dd5e86c980f3fa49fd1174aaeca67e4fcd3eb 100644
--- a/deps/libfabric/prov/rxd/src/rxd_cq.c
+++ b/deps/libfabric/prov/rxd/src/rxd_cq.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2018 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -93,7 +93,7 @@ static int rxd_match_pkt_entry(struct slist_entry *item, const void *arg)
 {
 	return ((struct rxd_pkt_entry *) arg ==
 		container_of(item, struct rxd_pkt_entry, s_entry));
-} 
+}
 
 static void rxd_remove_rx_pkt(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry)
 {
@@ -190,8 +190,8 @@ void rxd_ep_recv_data(struct rxd_ep *ep, struct rxd_x_entry *x_entry,
 	x_entry->next_seg_no++;
 
 	if (x_entry->next_seg_no < x_entry->num_segs) {
-		if (!(ep->peers[pkt->base_hdr.peer].rx_seq_no %
-		    ep->peers[pkt->base_hdr.peer].rx_window))
+		if (!(rxd_peer(ep, pkt->base_hdr.peer)->rx_seq_no %
+		    rxd_peer(ep, pkt->base_hdr.peer)->rx_window))
 			rxd_ep_send_ack(ep, pkt->base_hdr.peer);
 		return;
 	}
@@ -207,32 +207,33 @@ static void rxd_verify_active(struct rxd_ep *ep, fi_addr_t addr, fi_addr_t peer_
 {
 	struct rxd_pkt_entry *pkt_entry;
 
-	if (ep->peers[addr].peer_addr != FI_ADDR_UNSPEC &&
-	    ep->peers[addr].peer_addr != peer_addr)
+	if (rxd_peer(ep, addr)->peer_addr != RXD_ADDR_INVALID &&
+	    rxd_peer(ep, addr)->peer_addr != peer_addr)
 		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
 			"overwriting active peer - unexpected behavior\n");
 
-	ep->peers[addr].peer_addr = peer_addr;
+	rxd_peer(ep, addr)->peer_addr = peer_addr;
 
-	if (!dlist_empty(&ep->peers[addr].unacked) && 
-	    rxd_get_base_hdr(container_of((&ep->peers[addr].unacked)->next,
+	if (!dlist_empty(&(rxd_peer(ep, addr)->unacked)) &&
+	    rxd_get_base_hdr(container_of((&(rxd_peer(ep, addr)->unacked))->next,
 			     struct rxd_pkt_entry, d_entry))->type == RXD_RTS) {
-		dlist_pop_front(&ep->peers[addr].unacked,
+		dlist_pop_front(&(rxd_peer(ep, addr)->unacked),
 				struct rxd_pkt_entry, pkt_entry, d_entry);
 		if (pkt_entry->flags & RXD_PKT_IN_USE) {
 			dlist_insert_tail(&pkt_entry->d_entry, &ep->ctrl_pkts);
 			pkt_entry->flags |= RXD_PKT_ACKED;
 		} else {
 			ofi_buf_free(pkt_entry);
-			ep->peers[addr].unacked_cnt--;
+			rxd_peer(ep, addr)->unacked_cnt--;
 		}
-		dlist_remove(&ep->peers[addr].entry);
+		dlist_remove(&(rxd_peer(ep, addr)->entry));
 	}
 
-	if (!ep->peers[addr].active) {
-		dlist_insert_tail(&ep->peers[addr].entry, &ep->active_peers);
-		ep->peers[addr].retry_cnt = 0;
-		ep->peers[addr].active = 1;
+	if (!rxd_peer(ep, addr)->active) {
+		dlist_insert_tail(&(rxd_peer(ep, addr)->entry),
+				  &ep->active_peers);
+		rxd_peer(ep, addr)->retry_cnt = 0;
+		rxd_peer(ep, addr)->active = 1;
 	}
 }
 
@@ -240,17 +241,17 @@ int rxd_start_xfer(struct rxd_ep *ep, struct rxd_x_entry *tx_entry)
 {
 	struct rxd_base_hdr *hdr = rxd_get_base_hdr(tx_entry->pkt);
 
-	if (ep->peers[tx_entry->peer].unacked_cnt >=
-	    ep->peers[tx_entry->peer].tx_window)
+	if (rxd_peer(ep, tx_entry->peer)->unacked_cnt >=
+	    rxd_peer(ep, tx_entry->peer)->tx_window)
 		return 0;
 
-	tx_entry->start_seq = rxd_set_pkt_seq(&ep->peers[tx_entry->peer],
+	tx_entry->start_seq = rxd_set_pkt_seq(rxd_peer(ep, tx_entry->peer),
 					      tx_entry->pkt);
 	if (tx_entry->op != RXD_READ_REQ && tx_entry->num_segs > 1) {
-		ep->peers[tx_entry->peer].tx_seq_no = tx_entry->start_seq +
+		rxd_peer(ep, tx_entry->peer)->tx_seq_no = tx_entry->start_seq +
 						      tx_entry->num_segs;
 	}
-	hdr->peer = ep->peers[tx_entry->peer].peer_addr;
+	hdr->peer = rxd_peer(ep, tx_entry->peer)->peer_addr;
 	rxd_ep_send_pkt(ep, tx_entry->pkt);
 	rxd_insert_unacked(ep, tx_entry->peer, tx_entry->pkt);
 	tx_entry->pkt = NULL;
@@ -259,11 +260,11 @@ int rxd_start_xfer(struct rxd_ep *ep, struct rxd_x_entry *tx_entry)
 	    tx_entry->op == RXD_ATOMIC_COMPARE) {
 		dlist_remove(&tx_entry->entry);
 		dlist_insert_tail(&tx_entry->entry,
-				  &ep->peers[tx_entry->peer].rma_rx_list);
+				  &(rxd_peer(ep, tx_entry->peer)->rma_rx_list));
 	}
 
-	return ep->peers[tx_entry->peer].unacked_cnt <
-	       ep->peers[tx_entry->peer].tx_window;
+	return rxd_peer(ep, tx_entry->peer)->unacked_cnt <
+	       rxd_peer(ep, tx_entry->peer)->tx_window;
 }
 
 void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer)
@@ -279,7 +280,7 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer)
 					    struct rxd_pkt_entry, d_entry))->seq_no;
 	}
 
-	if (peer->peer_addr == FI_ADDR_UNSPEC)
+	if (peer->peer_addr == RXD_ADDR_INVALID)
 		return;
 
 	dlist_foreach_container_safe(&peer->tx_list, struct rxd_x_entry,
@@ -302,14 +303,14 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer)
 			}
 			continue;
 		}
-				
+
 		if (tx_entry->op == RXD_DATA_READ && !tx_entry->bytes_done) {
-			if (ep->peers[tx_entry->peer].unacked_cnt >=
-		    	    ep->peers[tx_entry->peer].tx_window) {
+			if (rxd_peer(ep, tx_entry->peer)->unacked_cnt >=
+			    rxd_peer(ep, tx_entry->peer)->tx_window) {
 				break;
-			} 
-			tx_entry->start_seq = ep->peers[tx_entry->peer].tx_seq_no;
-			ep->peers[tx_entry->peer].tx_seq_no = tx_entry->start_seq +
+			}
+			tx_entry->start_seq = rxd_peer(ep, tx_entry->peer)->tx_seq_no;
+			rxd_peer(ep, tx_entry->peer)->tx_seq_no = tx_entry->start_seq +
 							      tx_entry->num_segs;
 			inc = 1;
 		}
@@ -317,7 +318,7 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer)
 		ret = rxd_ep_post_data_pkts(ep, tx_entry);
 		if (ret) {
 			if (ret == -FI_ENOMEM && inc)
-				ep->peers[tx_entry->peer].tx_seq_no -=
+				rxd_peer(ep, tx_entry->peer)->tx_seq_no -=
 							  tx_entry->num_segs;
 			break;
 		}
@@ -330,7 +331,7 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer)
 static void rxd_update_peer(struct rxd_ep *ep, fi_addr_t peer, fi_addr_t peer_addr)
 {
 	rxd_verify_active(ep, peer, peer_addr);
-	rxd_progress_tx_list(ep, &ep->peers[peer]);
+	rxd_progress_tx_list(ep, rxd_peer(ep, peer));
 }
 
 static int rxd_send_cts(struct rxd_ep *rxd_ep, struct rxd_rts_pkt *rts_pkt,
@@ -438,6 +439,11 @@ static void rxd_handle_rts(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry)
 			return;
 	}
 
+	if (!rxd_peer(ep, rxd_addr)) {
+		if (rxd_create_peer(ep, rxd_addr) < 0)
+			return;
+	}
+
 	if (rxd_send_cts(ep, pkt, rxd_addr)) {
 		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
 			"error posting CTS\n");
@@ -512,18 +518,17 @@ static struct rxd_x_entry *rxd_match_rx(struct rxd_ep *ep,
 	}
 
 	if (!match) {
-		assert(!ep->peers[base->peer].curr_unexp);
+		assert(!rxd_peer(ep, base->peer)->curr_unexp);
 		unexp_msg = rxd_init_unexp(ep, pkt_entry, base, op,
 					   tag, data, msg, msg_size);
 		if (unexp_msg) {
 			dlist_insert_tail(&unexp_msg->entry, unexp_list);
-			ep->peers[base->peer].curr_unexp = unexp_msg;
+			rxd_peer(ep, base->peer)->curr_unexp = unexp_msg;
 		}
 		return NULL;
 	}
 
 	rx_entry = container_of(match, struct rxd_x_entry, entry);
-
 	total_size = op ? op->size : msg_size;
 
 	if (rx_entry->flags & RXD_MULTI_RECV) {
@@ -556,7 +561,7 @@ static int rxd_verify_iov(struct rxd_ep *ep, struct ofi_rma_iov *rma,
 		iov[i].iov_len = rma[i].len;
 		if (ret) {
 			FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not verify MR\n");
-			return -FI_EACCES; 
+			return -FI_EACCES;
 		}
 	}
 	return 0;
@@ -594,9 +599,9 @@ static struct rxd_x_entry *rxd_rma_read_entry_init(struct rxd_ep *ep,
 	rx_entry->cq_entry.flags = ofi_rx_cq_flags(RXD_READ_REQ);
 	rx_entry->cq_entry.len = sar_hdr->size;
 
-	dlist_insert_tail(&rx_entry->entry, &ep->peers[rx_entry->peer].tx_list);
+	dlist_insert_tail(&rx_entry->entry, &(rxd_peer(ep, rx_entry->peer)->tx_list));
 
-	rxd_progress_tx_list(ep, &ep->peers[rx_entry->peer]);
+	rxd_progress_tx_list(ep, rxd_peer(ep, rx_entry->peer));
 
 	return rx_entry;
 }
@@ -669,11 +674,11 @@ static struct rxd_x_entry *rxd_rx_atomic_fetch(struct rxd_ep *ep,
 	if (rx_entry->bytes_done != rx_entry->cq_entry.len)
 		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "fetch data length mismatch\n");
 
-	dlist_insert_tail(&rx_entry->entry, &ep->peers[rx_entry->peer].tx_list);
+	dlist_insert_tail(&rx_entry->entry, &(rxd_peer(ep, rx_entry->peer)->tx_list));
 
 	rxd_ep_send_ack(ep, base_hdr->peer);
 
-	rxd_progress_tx_list(ep, &ep->peers[rx_entry->peer]);
+	rxd_progress_tx_list(ep, rxd_peer(ep, rx_entry->peer));
 
 	return rx_entry;
 }
@@ -781,11 +786,11 @@ void rxd_do_atomic(void *src, void *dst, void *cmp, enum fi_datatype datatype,
 {
 	char tmp_result[RXD_MAX_MTU_SIZE];
 
-	if (atomic_op >= OFI_SWAP_OP_START) {
-		ofi_atomic_swap_handlers[atomic_op - OFI_SWAP_OP_START][datatype](dst,
-			src, cmp, tmp_result, cnt);
-	} else if (atomic_op != FI_ATOMIC_READ) {
-		ofi_atomic_write_handlers[atomic_op][datatype](dst, src, cnt);
+	if (ofi_atomic_isswap_op(atomic_op)) {
+		ofi_atomic_swap_handler(atomic_op, datatype, dst, src, cmp,
+					tmp_result, cnt);
+	} else if (ofi_atomic_iswrite_op(atomic_op)) {
+		ofi_atomic_write_handler(atomic_op, datatype, dst, src, cnt);
 	}
 }
 
@@ -802,22 +807,30 @@ void rxd_progress_atom_op(struct rxd_ep *ep, struct rxd_x_entry *rx_entry,
 			  void **msg, size_t msg_size)
 {
 	char *src, *cmp;
-	size_t len;
+	size_t data_size, len;
 	int i, iov_count;
 
 	src = (char *) (*msg);
-	cmp = base_hdr->type == RXD_ATOMIC_COMPARE ? (char *) (*msg) +
-		(msg_size / 2) : NULL;
-
+	cmp = base_hdr->type == RXD_ATOMIC_COMPARE ? src + (msg_size / 2) : NULL;
 	iov_count = sar_hdr ? sar_hdr->iov_count : 1;
-	for (i = len = 0; i < iov_count; i++) {
+
+	data_size = ofi_datatype_size(atom_hdr->datatype);
+	if (!data_size) {
+		FI_WARN(&rxd_prov, FI_LOG_EP_DATA,
+			"Invalid atomic datatype received\n");
+		len = ofi_total_iov_len(rx_entry->iov, iov_count);
+		goto out;
+	}
+
+	for (i = 0, len = 0; i < iov_count; i++) {
 		rxd_do_atomic(&src[len], rx_entry->iov[i].iov_base,
-			      cmp ? &cmp[len] : NULL, atom_hdr->datatype,
-			      atom_hdr->atomic_op, rx_entry->iov[i].iov_len /
-			      ofi_datatype_size(atom_hdr->datatype));
+			      cmp ? &cmp[len] : NULL,
+			      atom_hdr->datatype, atom_hdr->atomic_op,
+			      rx_entry->iov[i].iov_len / data_size);
 		len += rx_entry->iov[i].iov_len;
 	}
 
+out:
 	if (base_hdr->type == RXD_ATOMIC)
 		rx_entry->bytes_done = len;
 }
@@ -833,9 +846,9 @@ void rxd_progress_op(struct rxd_ep *ep, struct rxd_x_entry *rx_entry,
 		     void **msg, size_t size)
 {
 	if (sar_hdr)
-		ep->peers[base_hdr->peer].curr_tx_id = sar_hdr->tx_id;
+		rxd_peer(ep, base_hdr->peer)->curr_tx_id = sar_hdr->tx_id;
 
-	ep->peers[base_hdr->peer].curr_rx_id = rx_entry->rx_id;
+	rxd_peer(ep, base_hdr->peer)->curr_rx_id = rx_entry->rx_id;
 
 	if (base_hdr->type == RXD_READ_REQ)
 		return;
@@ -869,7 +882,7 @@ void rxd_progress_op(struct rxd_ep *ep, struct rxd_x_entry *rx_entry,
 	rx_entry->next_seg_no++;
 	rx_entry->start_seq = base_hdr->seq_no;
 
-	dlist_insert_tail(&rx_entry->entry, &ep->peers[base_hdr->peer].rx_list);
+	dlist_insert_tail(&rx_entry->entry, &(rxd_peer(ep, base_hdr->peer)->rx_list));
 }
 
 static struct rxd_x_entry *rxd_get_data_x_entry(struct rxd_ep *ep,
@@ -877,7 +890,7 @@ static struct rxd_x_entry *rxd_get_data_x_entry(struct rxd_ep *ep,
 {
 	if (data_pkt->base_hdr.type == RXD_DATA)
 		return ofi_bufpool_get_ibuf(ep->rx_entry_pool.pool,
-			     ep->peers[data_pkt->base_hdr.peer].curr_rx_id);
+			     rxd_peer(ep, data_pkt->base_hdr.peer)->curr_rx_id);
 
 	return ofi_bufpool_get_ibuf(ep->tx_entry_pool.pool, data_pkt->ext_hdr.tx_id);
 }
@@ -897,14 +910,15 @@ static void rxd_progress_buf_pkts(struct rxd_ep *ep, fi_addr_t peer)
 	size_t msg_size;
 	struct rxd_x_entry *rx_entry = NULL;
 	struct rxd_data_pkt *data_pkt;
+	struct dlist_entry *bufpkts;
 
-	while (!dlist_empty(&ep->peers[peer].buf_pkts)) {
-		pkt_entry = container_of((&ep->peers[peer].buf_pkts)->next,
-					struct rxd_pkt_entry, d_entry);
+	bufpkts = &(rxd_peer(ep, peer)->buf_pkts);
+	while (!dlist_empty(bufpkts)) {
+		pkt_entry = container_of(bufpkts->next, struct rxd_pkt_entry,
+					 d_entry);
 		base_hdr = rxd_get_base_hdr(pkt_entry);
-		if (base_hdr->seq_no != ep->peers[peer].rx_seq_no)
+		if (base_hdr->seq_no != rxd_peer(ep, peer)->rx_seq_no)
 			return;
-
 		if (base_hdr->type == RXD_DATA || base_hdr->type == RXD_DATA_READ) {
 			data_pkt = (struct rxd_data_pkt *) pkt_entry->pkt;
 			rx_entry = rxd_get_data_x_entry(ep, data_pkt);
@@ -922,14 +936,14 @@ static void rxd_progress_buf_pkts(struct rxd_ep *ep, fi_addr_t peer)
 				if (ret)
 					FI_WARN(&rxd_prov, FI_LOG_EP_CTRL,
 						"could not write error entry\n");
-				ep->peers[base_hdr->peer].rx_seq_no++;
+				rxd_peer(ep, base_hdr->peer)->rx_seq_no++;
 				rxd_remove_free_pkt_entry(pkt_entry);
 				continue;
 			}
 			if (!rx_entry) {
 				if (base_hdr->type == RXD_MSG ||
 				    base_hdr->type == RXD_TAGGED) {
-					ep->peers[base_hdr->peer].rx_seq_no++;
+					rxd_peer(ep, base_hdr->peer)->rx_seq_no++;
 					continue;
 				}
 				break;
@@ -940,7 +954,7 @@ static void rxd_progress_buf_pkts(struct rxd_ep *ep, fi_addr_t peer)
 					atom_hdr, &msg, msg_size);
 		}
 
-		ep->peers[base_hdr->peer].rx_seq_no++;
+		rxd_peer(ep, base_hdr->peer)->rx_seq_no++;
 		rxd_remove_free_pkt_entry(pkt_entry);
 	}
 }
@@ -957,27 +971,31 @@ static void rxd_handle_data(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry)
 		goto free;
 	}
 
-	if (pkt->base_hdr.seq_no == ep->peers[pkt->base_hdr.peer].rx_seq_no) {
-		ep->peers[pkt->base_hdr.peer].rx_seq_no++;
+	if (pkt->base_hdr.seq_no ==
+	    rxd_peer(ep, pkt->base_hdr.peer)->rx_seq_no) {
+		rxd_peer(ep, pkt->base_hdr.peer)->rx_seq_no++;
 		if (pkt->base_hdr.type == RXD_DATA &&
-		    ep->peers[pkt->base_hdr.peer].curr_unexp) {
-			unexp_msg = ep->peers[pkt->base_hdr.peer].curr_unexp;
+		    rxd_peer(ep, pkt->base_hdr.peer)->curr_unexp) {
+			unexp_msg = rxd_peer(ep, pkt->base_hdr.peer)->curr_unexp;
 			dlist_insert_tail(&pkt_entry->d_entry, &unexp_msg->pkt_list);
 			if (pkt->ext_hdr.seg_no + 1 == unexp_msg->sar_hdr->num_segs - 1) {
-				ep->peers[pkt->base_hdr.peer].curr_unexp = NULL;
+				rxd_peer(ep, pkt->base_hdr.peer)->curr_unexp = NULL;
 				rxd_ep_send_ack(ep, pkt->base_hdr.peer);
 			}
 			return;
 		}
 		x_entry = rxd_get_data_x_entry(ep, pkt);
 		rxd_ep_recv_data(ep, x_entry, pkt, pkt_entry->pkt_size);
-		if (!dlist_empty(&ep->peers[pkt->base_hdr.peer].buf_pkts))
+		if (!dlist_empty(&(rxd_peer(ep, pkt->base_hdr.peer)->buf_pkts)))
 			rxd_progress_buf_pkts(ep, pkt->base_hdr.peer);
 	} else if (!rxd_env.retry) {
-		dlist_insert_order(&ep->peers[pkt->base_hdr.peer].buf_pkts,
+		dlist_insert_order(&(rxd_peer(ep, pkt->base_hdr.peer)->buf_pkts),
 				   &rxd_comp_pkt_seq_no, &pkt_entry->d_entry);
 		return;
-	} else if (ep->peers[pkt->base_hdr.peer].peer_addr != FI_ADDR_UNSPEC) {
+	} else if (rxd_peer(ep, pkt->base_hdr.peer)->peer_addr !=
+		   RXD_ADDR_INVALID) {
 		rxd_ep_send_ack(ep, pkt->base_hdr.peer);
 	}
 free:
@@ -997,19 +1015,19 @@ static void rxd_handle_op(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry)
 	size_t msg_size;
 	int ret;
 
-	if (base_hdr->seq_no != ep->peers[base_hdr->peer].rx_seq_no) {
+	if (base_hdr->seq_no != rxd_peer(ep, base_hdr->peer)->rx_seq_no) {
 		if (!rxd_env.retry) {
-			dlist_insert_order(&ep->peers[base_hdr->peer].buf_pkts,
+			dlist_insert_order(&(rxd_peer(ep, base_hdr->peer)->buf_pkts),
 					   &rxd_comp_pkt_seq_no, &pkt_entry->d_entry);
 			return;
 		}
 
-		if (ep->peers[base_hdr->peer].peer_addr != FI_ADDR_UNSPEC)
+		if (rxd_peer(ep, base_hdr->peer)->peer_addr != RXD_ADDR_INVALID)
 			goto ack;
 		goto release;
 	}
 
-	if (ep->peers[base_hdr->peer].peer_addr == FI_ADDR_UNSPEC)
+	if (rxd_peer(ep, base_hdr->peer)->peer_addr == RXD_ADDR_INVALID)
 		goto release;
 
 	ret = rxd_unpack_init_rx(ep, &rx_entry, pkt_entry, base_hdr, &sar_hdr,
@@ -1020,27 +1038,27 @@ static void rxd_handle_op(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry)
 
 	if (!rx_entry) {
 		if (base_hdr->type == RXD_MSG || base_hdr->type == RXD_TAGGED) {
-			if (!ep->peers[base_hdr->peer].curr_unexp)
+			if (!rxd_peer(ep, base_hdr->peer)->curr_unexp)
 				goto ack;
 
-			ep->peers[base_hdr->peer].rx_seq_no++;
+			rxd_peer(ep, base_hdr->peer)->rx_seq_no++;
 
 			if (!sar_hdr)
-				ep->peers[base_hdr->peer].curr_unexp = NULL;
+				rxd_peer(ep, base_hdr->peer)->curr_unexp = NULL;
 
 			rxd_ep_send_ack(ep, base_hdr->peer);
 			return;
 		}
-		ep->peers[base_hdr->peer].rx_window = 0;
+		rxd_peer(ep, base_hdr->peer)->rx_window = 0;
 		goto ack;
 	}
 
-	ep->peers[base_hdr->peer].rx_seq_no++;
-	ep->peers[base_hdr->peer].rx_window = rxd_env.max_unacked;
+	rxd_peer(ep, base_hdr->peer)->rx_seq_no++;
+	rxd_peer(ep, base_hdr->peer)->rx_window = rxd_env.max_unacked;
 	rxd_progress_op(ep, rx_entry, pkt_entry, base_hdr, sar_hdr, tag_hdr,
 			data_hdr, rma_hdr, atom_hdr, &msg, msg_size);
 
-	if (!dlist_empty(&ep->peers[base_hdr->peer].buf_pkts))
+	if (!dlist_empty(&(rxd_peer(ep, base_hdr->peer)->buf_pkts)))
 		rxd_progress_buf_pkts(ep, base_hdr->peer);
 
 ack:
@@ -1069,20 +1087,22 @@ static void rxd_handle_ack(struct rxd_ep *ep, struct rxd_pkt_entry *ack_entry)
 	fi_addr_t peer = ack->base_hdr.peer;
 	struct rxd_base_hdr *hdr;
 
-	ep->peers[peer].tx_window = ack->ext_hdr.rx_id;
+	rxd_peer(ep, peer)->tx_window = ack->ext_hdr.rx_id;
 
-	if (ep->peers[peer].last_rx_ack == ack->base_hdr.seq_no)
+	if (rxd_peer(ep, peer)->last_rx_ack == ack->base_hdr.seq_no)
 		return;
 
-	ep->peers[peer].last_rx_ack = ack->base_hdr.seq_no;
+	rxd_peer(ep, peer)->last_rx_ack = ack->base_hdr.seq_no;
 
-	if (dlist_empty(&ep->peers[peer].unacked))
+	if (dlist_empty(&(rxd_peer(ep, peer)->unacked)))
 		return;
 
-	pkt_entry = container_of((&ep->peers[peer].unacked)->next,
-				struct rxd_pkt_entry, d_entry);
+	pkt_entry = container_of((&(rxd_peer(ep, peer)->unacked))->next,
+				 struct rxd_pkt_entry, d_entry);
 
-	while (&pkt_entry->d_entry != &ep->peers[peer].unacked) {
+	while (&pkt_entry->d_entry != &(rxd_peer(ep, peer)->unacked)) {
 		hdr = rxd_get_base_hdr(pkt_entry);
 		if (ofi_after_eq(hdr->seq_no, ack->base_hdr.seq_no))
 			break;
@@ -1094,15 +1114,15 @@ static void rxd_handle_ack(struct rxd_ep *ep, struct rxd_pkt_entry *ack_entry)
 			continue;
 		}
 		rxd_remove_free_pkt_entry(pkt_entry);
-		ep->peers[peer].unacked_cnt--;
-		ep->peers[peer].retry_cnt = 0;
+		rxd_peer(ep, peer)->unacked_cnt--;
+		rxd_peer(ep, peer)->retry_cnt = 0;
 
-		pkt_entry = container_of((&ep->peers[peer].unacked)->next,
+		pkt_entry = container_of((&(rxd_peer(ep, peer)->unacked))->next,
 					struct rxd_pkt_entry, d_entry);
 	}
 
-	rxd_progress_tx_list(ep, &ep->peers[ack->base_hdr.peer]);
-} 
+	rxd_progress_tx_list(ep, rxd_peer(ep, ack->base_hdr.peer));
+}
 
 void rxd_handle_send_comp(struct rxd_ep *ep, struct fi_cq_msg_entry *comp)
 {
@@ -1123,8 +1143,8 @@ void rxd_handle_send_comp(struct rxd_ep *ep, struct fi_cq_msg_entry *comp)
 		if (pkt_entry->flags & RXD_PKT_ACKED) {
 			peer = pkt_entry->peer;
 			rxd_remove_free_pkt_entry(pkt_entry);
-			ep->peers[peer].unacked_cnt--;
-			rxd_progress_tx_list(ep, &ep->peers[peer]);
+			rxd_peer(ep, peer)->unacked_cnt--;
+			rxd_progress_tx_list(ep, rxd_peer(ep, peer));
 		} else {
 			pkt_entry->flags &= ~RXD_PKT_IN_USE;
 		}
@@ -1182,7 +1202,7 @@ void rxd_handle_error(struct rxd_ep *ep)
 	} else {
 		FI_WARN(&rxd_prov, FI_LOG_CQ,
 			"Received %s error from core provider: %s\n",
-			err.flags & FI_SEND ? "tx" : "rx", fi_strerror(-err.err)); 
+			err.flags & FI_SEND ? "tx" : "rx", fi_strerror(-err.err));
 	}
 }
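Among the rxd_cq.c changes, rxd_progress_atom_op() now calls ofi_datatype_size() once and bails out when it returns 0 for an unrecognized datatype; the old code divided by ofi_datatype_size(atom_hdr->datatype) inside the loop, so a malformed header meant division by zero. The guard, reduced to its essentials with a stand-in for the real size function:

#include <stddef.h>

/* Stand-in for ofi_datatype_size(): returns 0 for an invalid datatype. */
size_t datatype_size(int datatype);

/* Sketch: validate once, divide afterwards; -1 flags malformed input. */
static int atoms_in_iov(size_t iov_len, int datatype)
{
	size_t size = datatype_size(datatype);

	if (!size)	/* never divide by zero on wire-supplied data */
		return -1;
	return (int) (iov_len / size);
}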
 
diff --git a/deps/libfabric/prov/rxd/src/rxd_domain.c b/deps/libfabric/prov/rxd/src/rxd_domain.c
index 92deff7de339a1ec929fcffbf65751c8a600f74e..96c1e7982ca3bb1f55391a4676ca5bc3a7fc174c 100644
--- a/deps/libfabric/prov/rxd/src/rxd_domain.c
+++ b/deps/libfabric/prov/rxd/src/rxd_domain.c
@@ -114,7 +114,7 @@ int rxd_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		return -FI_ENOMEM;
 
 	ret = ofi_get_core_info(fabric->api_version, NULL, NULL,
-				0, &rxd_util_prov, info,
+				0, &rxd_util_prov, info, NULL,
 				rxd_info_to_core, &dg_info);
 	if (ret)
 		goto err1;
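With the fixed peers[] array gone, per-peer state is created on demand: rxd_create_peer() (added in rxd_ep.c below) callocs a struct rxd_peer and registers it in ep->peers_idm, and every path that may see an address for the first time checks rxd_peer() for NULL first. The idiom in outline, assuming the rxd.h declarations above:

#include "rxd.h"

/* Sketch of the lazy-create pattern used by rxd_send_rts_if_needed()
 * and rxd_handle_rts(): allocate peer state on first touch, then rely
 * on rxd_peer() lookups (under the ep lock) afterwards. */
static int touch_peer(struct rxd_ep *ep, fi_addr_t rxd_addr)
{
	if (rxd_peer(ep, rxd_addr))
		return 0;			/* already known */

	return rxd_create_peer(ep, rxd_addr);	/* 0 or -FI_ENOMEM */
}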
diff --git a/deps/libfabric/prov/rxd/src/rxd_ep.c b/deps/libfabric/prov/rxd/src/rxd_ep.c
index 90190b3a5ba864e861a1dfeb7bd97bf41caaa60d..bdbb7c3a52e928806ddbedd7c2f398febf563505 100644
--- a/deps/libfabric/prov/rxd/src/rxd_ep.c
+++ b/deps/libfabric/prov/rxd/src/rxd_ep.c
@@ -305,7 +305,7 @@ void rxd_init_data_pkt(struct rxd_ep *ep, struct rxd_x_entry *tx_entry,
 	data_pkt->ext_hdr.rx_id = tx_entry->rx_id;
 	data_pkt->ext_hdr.tx_id = tx_entry->tx_id;
 	data_pkt->ext_hdr.seg_no = tx_entry->next_seg_no++;
-	data_pkt->base_hdr.peer = ep->peers[tx_entry->peer].peer_addr;
+	data_pkt->base_hdr.peer = rxd_peer(ep, tx_entry->peer)->peer_addr;
 
 	pkt_entry->pkt_size = ofi_copy_from_iov(data_pkt->msg, seg_size,
 						tx_entry->iov,
@@ -360,7 +360,7 @@ struct rxd_x_entry *rxd_tx_entry_init_common(struct rxd_ep *ep, fi_addr_t addr,
 	rxd_init_base_hdr(ep, &(*ptr), tx_entry);
 
 	dlist_insert_tail(&tx_entry->entry,
-			  &ep->peers[tx_entry->peer].tx_list);
+			  &(rxd_peer(ep, tx_entry->peer)->tx_list));
 
 	return tx_entry;
 }
@@ -377,8 +377,8 @@ void rxd_insert_unacked(struct rxd_ep *ep, fi_addr_t peer,
 			struct rxd_pkt_entry *pkt_entry)
 {
 	dlist_insert_tail(&pkt_entry->d_entry,
-			  &ep->peers[peer].unacked);
-	ep->peers[peer].unacked_cnt++;
+			  &(rxd_peer(ep, peer)->unacked));
+	rxd_peer(ep, peer)->unacked_cnt++;
 }
 
 ssize_t rxd_ep_post_data_pkts(struct rxd_ep *ep, struct rxd_x_entry *tx_entry)
@@ -387,8 +387,8 @@ ssize_t rxd_ep_post_data_pkts(struct rxd_ep *ep, struct rxd_x_entry *tx_entry)
 	struct rxd_data_pkt *data;
 
 	while (tx_entry->bytes_done != tx_entry->cq_entry.len) {
-		if (ep->peers[tx_entry->peer].unacked_cnt >=
-		    ep->peers[tx_entry->peer].tx_window)
+		if (rxd_peer(ep, tx_entry->peer)->unacked_cnt >=
+		    rxd_peer(ep, tx_entry->peer)->tx_window)
 			return 0;
 
 		pkt_entry = rxd_get_tx_pkt(ep);
@@ -407,19 +407,20 @@ ssize_t rxd_ep_post_data_pkts(struct rxd_ep *ep, struct rxd_x_entry *tx_entry)
 		rxd_insert_unacked(ep, tx_entry->peer, pkt_entry);
 	}
 
-	return ep->peers[tx_entry->peer].unacked_cnt >=
-	       ep->peers[tx_entry->peer].tx_window;
+	return rxd_peer(ep, tx_entry->peer)->unacked_cnt >=
+	       rxd_peer(ep, tx_entry->peer)->tx_window;
 }
 
 int rxd_ep_send_pkt(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry)
 {
 	int ret;
-
+	fi_addr_t dg_addr;
+
 	pkt_entry->timestamp = ofi_gettime_ms();
 
+	dg_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(ep)->rxdaddr_dg_idx),
+					    pkt_entry->peer);
 	ret = fi_send(ep->dg_ep, (const void *) rxd_pkt_start(pkt_entry),
-		      pkt_entry->pkt_size, pkt_entry->desc,
-		      rxd_ep_av(ep)->rxd_addr_table[pkt_entry->peer].dg_addr,
+		      pkt_entry->pkt_size, pkt_entry->desc, dg_addr,
 		      &pkt_entry->context);
 	if (ret) {
 		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "error sending packet: %d (%s)\n",
@@ -461,15 +462,21 @@ static ssize_t rxd_ep_send_rts(struct rxd_ep *rxd_ep, fi_addr_t rxd_addr)
 
 	rxd_ep_send_pkt(rxd_ep, pkt_entry);
 	rxd_insert_unacked(rxd_ep, rxd_addr, pkt_entry);
-	dlist_insert_tail(&rxd_ep->peers[rxd_addr].entry, &rxd_ep->rts_sent_list);
+	dlist_insert_tail(&(rxd_peer(rxd_ep, rxd_addr)->entry),
+			  &rxd_ep->rts_sent_list);
 
 	return 0;
 }
 
 ssize_t rxd_send_rts_if_needed(struct rxd_ep *ep, fi_addr_t addr)
 {
-	if (ep->peers[addr].peer_addr == FI_ADDR_UNSPEC &&
-	    dlist_empty(&ep->peers[addr].unacked))
+	if (!rxd_peer(ep, addr)) {
+		if (rxd_create_peer(ep, addr) < 0)
+			return -FI_ENOMEM;
+	}
+
+	if (rxd_peer(ep, addr)->peer_addr == RXD_ADDR_INVALID &&
+	    dlist_empty(&(rxd_peer(ep, addr)->unacked)))
 		return rxd_ep_send_rts(ep, addr);
 	return 0;
 }
@@ -482,7 +489,7 @@ void rxd_init_base_hdr(struct rxd_ep *rxd_ep, void **ptr,
 	hdr->version = RXD_PROTOCOL_VERSION;
 	hdr->type = tx_entry->op;
 	hdr->seq_no = 0;
-	hdr->peer = rxd_ep->peers[tx_entry->peer].peer_addr;
+	hdr->peer = rxd_peer(rxd_ep, tx_entry->peer)->peer_addr;
 	hdr->flags = tx_entry->flags;
 
 	*ptr = (char *) (*ptr) + sizeof(*hdr);
@@ -569,10 +576,10 @@ void rxd_ep_send_ack(struct rxd_ep *rxd_ep, fi_addr_t peer)
 
 	ack->base_hdr.version = RXD_PROTOCOL_VERSION;
 	ack->base_hdr.type = RXD_ACK;
-	ack->base_hdr.peer = rxd_ep->peers[peer].peer_addr;
-	ack->base_hdr.seq_no = rxd_ep->peers[peer].rx_seq_no;
-	ack->ext_hdr.rx_id = rxd_ep->peers[peer].rx_window;
-	rxd_ep->peers[peer].last_tx_ack = ack->base_hdr.seq_no;
+	ack->base_hdr.peer = rxd_peer(rxd_ep, peer)->peer_addr;
+	ack->base_hdr.seq_no = rxd_peer(rxd_ep, peer)->rx_seq_no;
+	ack->ext_hdr.rx_id = rxd_peer(rxd_ep, peer)->rx_window;
+	rxd_peer(rxd_ep, peer)->last_tx_ack = ack->base_hdr.seq_no;
 
 	dlist_insert_tail(&pkt_entry->d_entry, &rxd_ep->ctrl_pkts);
 	if (rxd_ep_send_pkt(rxd_ep, pkt_entry))
@@ -688,7 +695,8 @@ static int rxd_ep_close(struct fid *fid)
 				pkt_entry, d_entry);
 		ofi_buf_free(pkt_entry);
 	}
 
+	ofi_idm_reset(&(ep->peers_idm));
 	rxd_ep_free_res(ep);
 	ofi_endpoint_close(&ep->util_ep);
 	free(ep);
@@ -1136,23 +1144,37 @@ err:
 	return ret;
 }
 
-static void rxd_init_peer(struct rxd_ep *ep, uint64_t rxd_addr)
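+/* Allocate and initialize per-peer state on first use.  Peer objects are
+ * tracked in the peers_idm index map instead of a fixed array sized by
+ * rxd_env.max_peers, so memory scales with the number of active peers. */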
+int rxd_create_peer(struct rxd_ep *ep, uint64_t rxd_addr)
 {
-	ep->peers[rxd_addr].peer_addr = FI_ADDR_UNSPEC;
-	ep->peers[rxd_addr].tx_seq_no = 0;
-	ep->peers[rxd_addr].rx_seq_no = 0;
-	ep->peers[rxd_addr].last_rx_ack = 0;
-	ep->peers[rxd_addr].last_tx_ack = 0;
-	ep->peers[rxd_addr].rx_window = rxd_env.max_unacked;
-	ep->peers[rxd_addr].tx_window = rxd_env.max_unacked;
-	ep->peers[rxd_addr].unacked_cnt = 0;
-	ep->peers[rxd_addr].retry_cnt = 0;
-	ep->peers[rxd_addr].active = 0;
-	dlist_init(&ep->peers[rxd_addr].unacked);
-	dlist_init(&ep->peers[rxd_addr].tx_list);
-	dlist_init(&ep->peers[rxd_addr].rx_list);
-	dlist_init(&ep->peers[rxd_addr].rma_rx_list);
-	dlist_init(&ep->peers[rxd_addr].buf_pkts);
+	struct rxd_peer *peer;
+
+	peer = calloc(1, sizeof(struct rxd_peer));
+	if (!peer)
+		return -FI_ENOMEM;
+
+	peer->peer_addr = RXD_ADDR_INVALID;
+	peer->tx_seq_no = 0;
+	peer->rx_seq_no = 0;
+	peer->last_rx_ack = 0;
+	peer->last_tx_ack = 0;
+	peer->rx_window = rxd_env.max_unacked;
+	peer->tx_window = rxd_env.max_unacked;
+	peer->unacked_cnt = 0;
+	peer->retry_cnt = 0;
+	peer->active = 0;
+	dlist_init(&(peer->unacked));
+	dlist_init(&(peer->tx_list));
+	dlist_init(&(peer->rx_list));
+	dlist_init(&(peer->rma_rx_list));
+	dlist_init(&(peer->buf_pkts));
+
+	if (ofi_idm_set(&(ep->peers_idm), rxd_addr, peer) < 0)
+		goto err;
+
+	return 0;
+err:
+	free(peer);
+	return -FI_ENOMEM;
 }
 
 int rxd_endpoint(struct fid_domain *domain, struct fi_info *info,
@@ -1161,10 +1183,12 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info,
 	struct fi_info *dg_info;
 	struct rxd_domain *rxd_domain;
 	struct rxd_ep *rxd_ep;
-	int ret, i;
+	int ret;
 
-	rxd_ep = calloc(1, sizeof(*rxd_ep) + sizeof(struct rxd_peer) *
-			rxd_env.max_peers);
+	rxd_ep = calloc(1, sizeof(*rxd_ep));
 	if (!rxd_ep)
 		return -FI_ENOMEM;
 
@@ -1177,7 +1201,7 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info,
 		goto err1;
 
 	ret = ofi_get_core_info(rxd_domain->util_domain.fabric->fabric_fid.api_version,
-				NULL, NULL, 0, &rxd_util_prov, info,
+				NULL, NULL, 0, &rxd_util_prov, info, NULL,
 				rxd_info_to_core, &dg_info);
 	if (ret)
 		goto err2;
@@ -1205,10 +1229,9 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info,
 	ret = rxd_ep_init_res(rxd_ep, info);
 	if (ret)
 		goto err3;
-
-	for (i = 0; i < rxd_env.max_peers; rxd_init_peer(rxd_ep, i++))
-		;
-
+
+	memset(&(rxd_ep->peers_idm), 0, sizeof(rxd_ep->peers_idm));
+
 	rxd_ep->util_ep.ep_fid.fid.ops = &rxd_ep_fi_ops;
 	rxd_ep->util_ep.ep_fid.cm = &rxd_ep_cm;
 	rxd_ep->util_ep.ep_fid.ops = &rxd_ops_ep;
diff --git a/deps/libfabric/prov/rxd/src/rxd_init.c b/deps/libfabric/prov/rxd/src/rxd_init.c
index 058e54662b71526221981f5cbe65d7d8027467bb..0969bff862d973642b94ff101af237098e30f52c 100644
--- a/deps/libfabric/prov/rxd/src/rxd_init.c
+++ b/deps/libfabric/prov/rxd/src/rxd_init.c
@@ -77,7 +77,7 @@ void rxd_info_to_core_mr_modes(uint32_t version, const struct fi_info *hints,
 }
 
 int rxd_info_to_core(uint32_t version, const struct fi_info *rxd_info,
-		     struct fi_info *core_info)
+		     const struct fi_info *base_info, struct fi_info *core_info)
 {
 	rxd_info_to_core_mr_modes(version, rxd_info, core_info);
 	core_info->caps = FI_MSG;
@@ -88,7 +88,7 @@ int rxd_info_to_core(uint32_t version, const struct fi_info *rxd_info,
 }
 
 int rxd_info_to_rxd(uint32_t version, const struct fi_info *core_info,
-		    struct fi_info *info)
+		    const struct fi_info *base_info, struct fi_info *info)
 {
 	info->caps = ofi_pick_core_flags(rxd_info.caps, core_info->caps,
 					 FI_LOCAL_COMM | FI_REMOTE_COMM);
diff --git a/deps/libfabric/prov/rxd/src/rxd_msg.c b/deps/libfabric/prov/rxd/src/rxd_msg.c
index 5ad6be747741040208c58c66ef9541d2f511e253..58239d7e82928c65d23b27ec5f6ffc67b26a80ed 100644
--- a/deps/libfabric/prov/rxd/src/rxd_msg.c
+++ b/deps/libfabric/prov/rxd/src/rxd_msg.c
@@ -77,7 +77,7 @@ static void rxd_progress_unexp_msg(struct rxd_ep *ep, struct rxd_x_entry *rx_ent
 {
 	struct rxd_pkt_entry *pkt_entry;
 	uint64_t num_segs = 0;
-	uint16_t curr_id = ep->peers[unexp_msg->base_hdr->peer].curr_rx_id;
+	uint16_t curr_id = rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_rx_id;
 
 	rxd_progress_op(ep, rx_entry, unexp_msg->pkt_entry, unexp_msg->base_hdr,
 			unexp_msg->sar_hdr, unexp_msg->tag_hdr,
@@ -93,11 +93,11 @@ static void rxd_progress_unexp_msg(struct rxd_ep *ep, struct rxd_x_entry *rx_ent
 		num_segs++;
 	}
 
-	if (ep->peers[unexp_msg->base_hdr->peer].curr_unexp) {
+	if (rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_unexp) {
 		if (!unexp_msg->sar_hdr || num_segs == unexp_msg->sar_hdr->num_segs - 1)
-			ep->peers[unexp_msg->base_hdr->peer].curr_rx_id = curr_id;
+			rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_rx_id = curr_id;
 		else
-			ep->peers[unexp_msg->base_hdr->peer].curr_unexp = NULL;
+			rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_unexp = NULL;
 	}
 
 	rxd_free_unexp_msg(unexp_msg);
@@ -143,8 +143,9 @@ static int rxd_ep_discard_recv(struct rxd_ep *rxd_ep, void *context,
 	assert(unexp_msg->tag_hdr);
 	seq += unexp_msg->sar_hdr ? unexp_msg->sar_hdr->num_segs : 1;
 
-	rxd_ep->peers[unexp_msg->base_hdr->peer].rx_seq_no =
-			MAX(seq, rxd_ep->peers[unexp_msg->base_hdr->peer].rx_seq_no);
+	rxd_peer(rxd_ep, unexp_msg->base_hdr->peer)->rx_seq_no =
+			MAX(seq, rxd_peer(rxd_ep,
+				 unexp_msg->base_hdr->peer)->rx_seq_no);
 	rxd_ep_send_ack(rxd_ep, unexp_msg->base_hdr->peer);
 
 	ret = ofi_cq_write(rxd_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
@@ -201,11 +202,14 @@ ssize_t rxd_ep_generic_recvmsg(struct rxd_ep *rxd_ep, const struct iovec *iov,
 	struct rxd_x_entry *rx_entry;
 	struct dlist_entry *unexp_list, *rx_list;
 	struct rxd_unexp_msg *unexp_msg;
+	fi_addr_t rxd_addr = RXD_ADDR_INVALID;
 
 	assert(iov_count <= RXD_IOV_LIMIT);
 	assert(!(rxd_flags & RXD_MULTI_RECV) || iov_count == 1);
 	assert(!(flags & FI_PEEK) || op == RXD_TAGGED);
 
 	fastlock_acquire(&rxd_ep->util_ep.lock);
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.rx_cq->cirq)) {
@@ -220,19 +224,22 @@ ssize_t rxd_ep_generic_recvmsg(struct rxd_ep *rxd_ep, const struct iovec *iov,
 		unexp_list = &rxd_ep->unexp_list;
 		rx_list = &rxd_ep->rx_list;
 	}
+
+	if ((rxd_ep->util_ep.caps & FI_DIRECTED_RECV) &&
+	    addr != FI_ADDR_UNSPEC) {
+		rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+						     RXD_IDX_OFFSET(addr));
+	}
 
 	if (flags & FI_PEEK) {
-		ret = rxd_peek_recv(rxd_ep, addr, tag, ignore, context, flags,
+		ret = rxd_peek_recv(rxd_ep, rxd_addr, tag, ignore, context, flags,
 				    unexp_list);
 		goto out;
 	}
-
 	if (!(flags & FI_DISCARD)) {
-		rx_entry = rxd_rx_entry_init(rxd_ep, iov, iov_count, tag, ignore, context,
-					(rxd_ep->util_ep.caps & FI_DIRECTED_RECV &&
-					addr != FI_ADDR_UNSPEC) ?
-					rxd_ep_av(rxd_ep)->fi_addr_table[addr] :
-					FI_ADDR_UNSPEC, op, rxd_flags);
+		rx_entry = rxd_rx_entry_init(rxd_ep, iov, iov_count, tag, ignore,
+					     context, rxd_addr, op, rxd_flags);
 		if (!rx_entry) {
 			ret = -FI_EAGAIN;
 		} else if (flags & FI_CLAIM) {
@@ -358,8 +365,12 @@ ssize_t rxd_ep_generic_inject(struct rxd_ep *rxd_ep, const struct iovec *iov,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
+
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+					     RXD_IDX_OFFSET(addr));
+	if (!rxd_addr)
+		goto out;
 
-	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
 	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
 	if (ret)
 		goto out;
@@ -371,7 +382,7 @@ ssize_t rxd_ep_generic_inject(struct rxd_ep *rxd_ep, const struct iovec *iov,
 		goto out;
 	}
 
-	if (rxd_ep->peers[rxd_addr].peer_addr != FI_ADDR_UNSPEC)
+	if (rxd_peer(rxd_ep, rxd_addr)->peer_addr != RXD_ADDR_INVALID)
 		(void) rxd_start_xfer(rxd_ep, tx_entry);
 
 out:
@@ -398,8 +409,12 @@ ssize_t rxd_ep_generic_sendmsg(struct rxd_ep *rxd_ep, const struct iovec *iov,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
-
-	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
+
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+					     RXD_IDX_OFFSET(addr));
+	if (!rxd_addr)
+		goto out;
+
 	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
 	if (ret)
 		goto out;
@@ -409,7 +424,7 @@ ssize_t rxd_ep_generic_sendmsg(struct rxd_ep *rxd_ep, const struct iovec *iov,
 	if (!tx_entry)
 		goto out;
 
-	if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC)
+	if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID)
 		goto out;
 
 	ret = rxd_start_xfer(rxd_ep, tx_entry);
diff --git a/deps/libfabric/prov/rxd/src/rxd_rma.c b/deps/libfabric/prov/rxd/src/rxd_rma.c
index bb93d808549468bff2d2fdd2580f32c1c8efafc9..ae9649a0067def280dac2e733a5921195b766f3e 100644
--- a/deps/libfabric/prov/rxd/src/rxd_rma.c
+++ b/deps/libfabric/prov/rxd/src/rxd_rma.c
@@ -100,8 +100,11 @@ static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
-
-	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
+
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+					     RXD_IDX_OFFSET(addr));
+	if (!rxd_addr)
+		goto out;
 	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
 	if (ret)
 		goto out;
@@ -114,7 +117,7 @@ static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep,
 		goto out;
 	}
 
-	if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC)
+	if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID)
 		goto out;
 
 	ret = rxd_start_xfer(rxd_ep, tx_entry);
@@ -147,8 +150,11 @@ ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov,
 
 	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
 		goto out;
+
+	rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx),
+					     RXD_IDX_OFFSET(addr));
+	if (!rxd_addr)
+		goto out;
 
-	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
 	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
 	if (ret)
 		goto out;
@@ -161,7 +167,7 @@ ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov,
 		goto out;
 	}
 
-	if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC)
+	if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID)
 		goto out;
 
 	ret = rxd_start_xfer(rxd_ep, tx_entry);
diff --git a/deps/libfabric/prov/rxm/src/rxm.h b/deps/libfabric/prov/rxm/src/rxm.h
index 69c7c517b825e784e25076228429573c580672cd..0494ffd911eb0de06a428020619bd56ab5e9b9ae 100644
--- a/deps/libfabric/prov/rxm/src/rxm.h
+++ b/deps/libfabric/prov/rxm/src/rxm.h
@@ -126,11 +126,10 @@ extern struct fi_ops_atomic rxm_ops_atomic;
 
 extern size_t rxm_msg_tx_size;
 extern size_t rxm_msg_rx_size;
-extern size_t rxm_def_univ_size;
 extern size_t rxm_cm_progress_interval;
 extern size_t rxm_cq_eq_fairness;
 extern int force_auto_progress;
-extern enum fi_wait_obj def_wait_obj;
+extern enum fi_wait_obj def_wait_obj, def_tcp_wait_obj;
 
 struct rxm_ep;
 
@@ -244,6 +243,9 @@ union rxm_cm_data {
 	} reject;
 };
 
+int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr,
+			  enum rxm_cmap_state state,
+			  struct rxm_cmap_handle **handle);
 struct rxm_cmap_handle *rxm_cmap_key2handle(struct rxm_cmap *cmap, uint64_t key);
 int rxm_cmap_update(struct rxm_cmap *cmap, const void *addr, fi_addr_t fi_addr);
 
@@ -254,7 +256,6 @@ void rxm_cmap_process_shutdown(struct rxm_cmap *cmap,
 			       struct rxm_cmap_handle *handle);
 int rxm_cmap_connect(struct rxm_ep *rxm_ep, fi_addr_t fi_addr,
 		     struct rxm_cmap_handle *handle);
-void rxm_cmap_del_handle_ts(struct rxm_cmap_handle *handle);
 void rxm_cmap_free(struct rxm_cmap *cmap);
 int rxm_cmap_alloc(struct rxm_ep *rxm_ep, struct rxm_cmap_attr *attr);
 int rxm_cmap_remove(struct rxm_cmap *cmap, int index);
@@ -278,6 +279,7 @@ struct rxm_domain {
 	size_t max_atomic_size;
 	uint64_t mr_key;
 	uint8_t mr_local;
+	struct ofi_ops_flow_ctrl *flow_ctrl_ops;
 };
 
 int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
@@ -333,6 +335,7 @@ struct rxm_atomic_resp_hdr {
 	FUNC(RXM_RMA),			\
 	FUNC(RXM_RX),			\
 	FUNC(RXM_SAR_TX),		\
+	FUNC(RXM_CREDIT_TX),		\
 	FUNC(RXM_RNDV_TX),		\
 	FUNC(RXM_RNDV_ACK_WAIT),	\
 	FUNC(RXM_RNDV_READ),		\
@@ -355,6 +358,7 @@ enum {
 	rxm_ctrl_rndv_ack,
 	rxm_ctrl_atomic,
 	rxm_ctrl_atomic_resp,
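+	/* flow control credit message */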
+	rxm_ctrl_credit
 };
 
 struct rxm_pkt {
@@ -414,6 +418,7 @@ enum rxm_buf_pool_type {
 	RXM_BUF_POOL_TX_ACK,
 	RXM_BUF_POOL_TX_RNDV,
 	RXM_BUF_POOL_TX_ATOMIC,
+	RXM_BUF_POOL_TX_CREDIT,
 	RXM_BUF_POOL_TX_SAR,
 	RXM_BUF_POOL_TX_END	= RXM_BUF_POOL_TX_SAR,
 	RXM_BUF_POOL_RMA,
@@ -531,6 +536,7 @@ enum rxm_deferred_tx_entry_type {
 	RXM_DEFERRED_TX_RNDV_READ,
 	RXM_DEFERRED_TX_SAR_SEG,
 	RXM_DEFERRED_TX_ATOMIC_RESP,
+	RXM_DEFERRED_TX_CREDIT_SEND,
 };
 
 struct rxm_deferred_tx_entry {
@@ -570,6 +576,9 @@ struct rxm_deferred_tx_entry {
 			struct rxm_tx_atomic_buf *tx_buf;
 			ssize_t len;
 		} atomic_resp;
+		struct {
+			struct rxm_tx_base_buf *tx_buf;
+		} credit_msg;
 	};
 };
 
@@ -637,12 +646,10 @@ struct rxm_msg_eq_entry {
 #define RXM_CM_ENTRY_SZ (sizeof(struct fi_eq_cm_entry) + \
 			 sizeof(union rxm_cm_data))
 
-struct rxm_handle_txrx_ops {
-	int (*comp_eager_tx)(struct rxm_ep *rxm_ep,
-				    struct rxm_tx_eager_buf *tx_eager_buf);
-	ssize_t (*handle_eager_rx)(struct rxm_rx_buf *rx_buf);
-	ssize_t (*handle_rndv_rx)(struct rxm_rx_buf *rx_buf);
-	ssize_t (*handle_seg_data_rx)(struct rxm_rx_buf *rx_buf);
+struct rxm_eager_ops {
+	int (*comp_tx)(struct rxm_ep *rxm_ep,
+		       struct rxm_tx_eager_buf *tx_eager_buf);
+	ssize_t (*handle_rx)(struct rxm_rx_buf *rx_buf);
 };
 
 struct rxm_ep {
@@ -678,7 +685,7 @@ struct rxm_ep {
 	struct rxm_recv_queue	recv_queue;
 	struct rxm_recv_queue	trecv_queue;
 
-	struct rxm_handle_txrx_ops *txrx_ops;
+	struct rxm_eager_ops	*eager_ops;
 };
 
 struct rxm_conn {
@@ -702,7 +709,6 @@ struct rxm_conn {
 };
 
 extern struct fi_provider rxm_prov;
-extern struct fi_info rxm_info;
 extern struct fi_fabric_attr rxm_fabric_attr;
 extern struct fi_domain_attr rxm_domain_attr;
 extern struct fi_tx_attr rxm_tx_attr;
@@ -711,14 +717,14 @@ extern struct fi_rx_attr rxm_rx_attr;
 int rxm_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
 			void *context);
 int rxm_info_to_core(uint32_t version, const struct fi_info *rxm_info,
-		     struct fi_info *core_info);
+		     const struct fi_info *base_info, struct fi_info *core_info);
 int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info,
-		    struct fi_info *info);
+		    const struct fi_info *base_info, struct fi_info *info);
 int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 			     struct fid_domain **dom, void *context);
 int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 			 struct fid_cq **cq_fid, void *context);
-ssize_t rxm_cq_handle_rx_buf(struct rxm_rx_buf *rx_buf);
+ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf);
 
 int rxm_endpoint(struct fid_domain *domain, struct fi_info *info,
 			  struct fid_ep **ep, void *context);
@@ -727,16 +733,14 @@ int rxm_conn_cmap_alloc(struct rxm_ep *rxm_ep);
 void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr,
 			void *op_context, int err);
 void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err);
-void rxm_cq_read_write_error(struct rxm_ep *rxm_ep);
-ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp);
+void rxm_handle_comp_error(struct rxm_ep *rxm_ep);
+ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp);
 void rxm_ep_progress(struct util_ep *util_ep);
 void rxm_ep_progress_coll(struct util_ep *util_ep);
 void rxm_ep_do_progress(struct util_ep *util_ep);
 
-ssize_t rxm_cq_handle_eager(struct rxm_rx_buf *rx_buf);
-ssize_t rxm_cq_handle_coll_eager(struct rxm_rx_buf *rx_buf);
-ssize_t rxm_cq_handle_rndv(struct rxm_rx_buf *rx_buf);
-ssize_t rxm_cq_handle_seg_data(struct rxm_rx_buf *rx_buf);
+ssize_t rxm_handle_eager(struct rxm_rx_buf *rx_buf);
+ssize_t rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf);
 int rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_eager_buf);
 int rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_eager_buf);
 
@@ -837,31 +841,8 @@ static inline void rxm_cq_log_comp(uint64_t flags)
 #endif
 }
 
-static inline ssize_t
-rxm_ep_prepare_tx(struct rxm_ep *rxm_ep, fi_addr_t dest_addr,
-		  struct rxm_conn **rxm_conn)
-{
-	ssize_t ret;
-
-	assert(rxm_ep->util_ep.tx_cq);
-	*rxm_conn = (struct rxm_conn *)rxm_cmap_acquire_handle(rxm_ep->cmap,
-							       dest_addr);
-	if (OFI_UNLIKELY(!*rxm_conn))
-		return -FI_EHOSTUNREACH;
-
-	if (OFI_UNLIKELY((*rxm_conn)->handle.state != RXM_CMAP_CONNECTED)) {
-		ret = rxm_cmap_connect(rxm_ep, dest_addr, &(*rxm_conn)->handle);
-		if (ret)
-			return ret;
-	}
-
-	if (OFI_UNLIKELY(!dlist_empty(&(*rxm_conn)->deferred_tx_queue))) {
-		rxm_ep_do_progress(&rxm_ep->util_ep);
-		if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue))
-			return -FI_EAGAIN;
-	}
-	return 0;
-}
+ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr,
+		     struct rxm_conn **rxm_conn);
 
 static inline void
 rxm_ep_format_tx_buf_pkt(struct rxm_conn *rxm_conn, size_t len, uint8_t op,
@@ -876,8 +857,7 @@ rxm_ep_format_tx_buf_pkt(struct rxm_conn *rxm_conn, size_t len, uint8_t op,
 	pkt->hdr.data = data;
 }
 
-
-static inline struct rxm_buf *
+static inline void *
 rxm_tx_buf_alloc(struct rxm_ep *rxm_ep, enum rxm_buf_pool_type type)
 {
 	assert((type == RXM_BUF_POOL_TX) ||
@@ -885,11 +865,11 @@ rxm_tx_buf_alloc(struct rxm_ep *rxm_ep, enum rxm_buf_pool_type type)
 	       (type == RXM_BUF_POOL_TX_ACK) ||
 	       (type == RXM_BUF_POOL_TX_RNDV) ||
 	       (type == RXM_BUF_POOL_TX_ATOMIC) ||
+	       (type == RXM_BUF_POOL_TX_CREDIT) ||
 	       (type == RXM_BUF_POOL_TX_SAR));
 	return ofi_buf_alloc(rxm_ep->buf_pools[type].pool);
 }
 
-
 static inline struct rxm_rx_buf *
 rxm_rx_buf_alloc(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, uint8_t repost)
 {
@@ -919,19 +899,6 @@ rxm_rx_buf_free(struct rxm_rx_buf *rx_buf)
 	}
 }
 
-static inline struct rxm_rma_buf *rxm_rma_buf_alloc(struct rxm_ep *rxm_ep)
-{
-	return (struct rxm_rma_buf *)
-		ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool);
-}
-
-static inline
-struct rxm_tx_atomic_buf *rxm_tx_atomic_buf_alloc(struct rxm_ep *rxm_ep)
-{
-	return (struct rxm_tx_atomic_buf *)
-		rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
-}
-
 static inline void
 rxm_recv_entry_release(struct rxm_recv_queue *queue, struct rxm_recv_entry *entry)
 {
diff --git a/deps/libfabric/prov/rxm/src/rxm_atomic.c b/deps/libfabric/prov/rxm/src/rxm_atomic.c
index 3c9094b89d98454514fe4ad331d2529631923a90..de7e234e9da0a81134f275fd4688a45162a6b30d 100644
--- a/deps/libfabric/prov/rxm/src/rxm_atomic.c
+++ b/deps/libfabric/prov/rxm/src/rxm_atomic.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2018 Cray Inc. All rights reserved.
+ * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -140,8 +141,7 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		goto restore_credit;
 	}
 
-	tx_buf = (struct rxm_tx_atomic_buf *)
-		 rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
+	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
 	if (OFI_UNLIKELY(!tx_buf)) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"Ran out of buffers from Atomic buffer pool\n");
@@ -182,13 +182,12 @@ static ssize_t
 rxm_ep_generic_atomic_writemsg(struct rxm_ep *rxm_ep, const struct fi_msg_atomic *msg,
 			       uint64_t flags)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
+	ssize_t ret;
 
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0,
@@ -293,13 +292,12 @@ rxm_ep_generic_atomic_readwritemsg(struct rxm_ep *rxm_ep,
 				   struct fi_ioc *resultv, void **result_desc,
 				   size_t result_count, uint64_t flags)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
+	ssize_t ret;
 
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0,
@@ -387,13 +385,12 @@ rxm_ep_generic_atomic_compwritemsg(struct rxm_ep *rxm_ep,
 				   void **result_desc, size_t result_count,
 				   uint64_t flags)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
+	ssize_t ret;
 
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, comparev,
diff --git a/deps/libfabric/prov/rxm/src/rxm_attr.c b/deps/libfabric/prov/rxm/src/rxm_attr.c
index a8fc84e99a6a1d0ac8f8c5d252461c8ccc6cc0be..63fcf69094550860c6a78316734dd2c5d237d5dc 100644
--- a/deps/libfabric/prov/rxm/src/rxm_attr.c
+++ b/deps/libfabric/prov/rxm/src/rxm_attr.c
@@ -38,7 +38,6 @@
 		     FI_MULTI_RECV)
 #define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM)
 
-// TODO have a separate "check info" against which app hints would be checked.
 
 /* Since we are a layering provider, the attributes for which we rely on the
  * core provider are set to full capability. This ensures that ofix_getinfo
@@ -64,6 +63,25 @@ struct fi_rx_attr rxm_rx_attr = {
 	.iov_limit= RXM_IOV_LIMIT,
 };
 
+struct fi_tx_attr rxm_tx_attr_coll = {
+	.caps = RXM_TX_CAPS | FI_COLLECTIVE,
+	.op_flags = RXM_PASSTHRU_TX_OP_FLAGS | RXM_TX_OP_FLAGS,
+	.msg_order = ~0x0ULL,
+	.comp_order = FI_ORDER_NONE,
+	.size = 1024,
+	.iov_limit = RXM_IOV_LIMIT,
+	.rma_iov_limit = RXM_IOV_LIMIT,
+};
+
+struct fi_rx_attr rxm_rx_attr_coll = {
+	.caps = RXM_RX_CAPS | FI_COLLECTIVE,
+	.op_flags = RXM_PASSTHRU_RX_OP_FLAGS | RXM_RX_OP_FLAGS,
+	.msg_order = ~0x0ULL,
+	.comp_order = FI_ORDER_NONE,
+	.size = 1024,
+	.iov_limit= RXM_IOV_LIMIT,
+};
+
 struct fi_ep_attr rxm_ep_attr = {
 	.type = FI_EP_RDM,
 	.protocol = FI_PROTO_RXM,
@@ -77,6 +95,19 @@ struct fi_ep_attr rxm_ep_attr = {
 	.mem_tag_format = FI_TAG_GENERIC,
 };
 
+struct fi_ep_attr rxm_ep_attr_coll = {
+	.type = FI_EP_RDM,
+	.protocol = FI_PROTO_RXM,
+	.protocol_version = 1,
+	.max_msg_size = SIZE_MAX,
+	.tx_ctx_cnt = 1,
+	.rx_ctx_cnt = 1,
+	.max_order_raw_size = SIZE_MAX,
+	.max_order_war_size = SIZE_MAX,
+	.max_order_waw_size = SIZE_MAX,
+	.mem_tag_format = FI_TAG_GENERIC >> 1,
+};
+
 struct fi_domain_attr rxm_domain_attr = {
 	.caps = RXM_DOMAIN_CAPS,
 	.threading = FI_THREAD_SAFE,
@@ -86,7 +117,8 @@ struct fi_domain_attr rxm_domain_attr = {
 	.av_type = FI_AV_UNSPEC,
 	/* Advertise support for FI_MR_BASIC so that ofi_check_info call
 	 * doesn't fail at RxM level. If an app requires FI_MR_BASIC, it
-	 * would be passed down to core provider. */
+	 * would be passed down to core provider.
+	 */
 	.mr_mode = FI_MR_BASIC | FI_MR_SCALABLE,
 	.cq_data_size = sizeof_field(struct ofi_op_hdr, data),
 	.cq_cnt = (1 << 16),
@@ -102,17 +134,61 @@ struct fi_fabric_attr rxm_fabric_attr = {
 	.prov_version = OFI_VERSION_DEF_PROV,
 };
 
-struct fi_info rxm_info = {
+struct fi_fabric_attr rxm_verbs_fabric_attr = {
+	.prov_version = OFI_VERSION_DEF_PROV,
+	.prov_name = "verbs",
+};
+
+struct fi_fabric_attr rxm_tcp_fabric_attr = {
+	.prov_version = OFI_VERSION_DEF_PROV,
+	.prov_name = "tcp",
+};
+
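+/* The info structures below are linked through their next pointers
+ * (verbs -> tcp -> base -> coll) to express the order in which core
+ * providers and capability sets are tried. */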
+struct fi_info rxm_coll_info = {
 	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS | FI_COLLECTIVE,
 	.addr_format = FI_SOCKADDR,
+	.tx_attr = &rxm_tx_attr_coll,
+	.rx_attr = &rxm_rx_attr_coll,
+	.ep_attr = &rxm_ep_attr_coll,
+	.domain_attr = &rxm_domain_attr,
+	.fabric_attr = &rxm_fabric_attr
+};
+
+struct fi_info rxm_base_info = {
+	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS,
+	.addr_format = FI_SOCKADDR,
 	.tx_attr = &rxm_tx_attr,
 	.rx_attr = &rxm_rx_attr,
 	.ep_attr = &rxm_ep_attr,
 	.domain_attr = &rxm_domain_attr,
-	.fabric_attr = &rxm_fabric_attr
+	.fabric_attr = &rxm_fabric_attr,
+	.next = &rxm_coll_info,
+};
+
+struct fi_info rxm_tcp_info = {
+	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS,
+	.addr_format = FI_SOCKADDR,
+	.tx_attr = &rxm_tx_attr,
+	.rx_attr = &rxm_rx_attr,
+	.ep_attr = &rxm_ep_attr,
+	.domain_attr = &rxm_domain_attr,
+	.fabric_attr = &rxm_tcp_fabric_attr,
+	.next = &rxm_base_info,
+};
+
+struct fi_info rxm_verbs_info = {
+	.caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS,
+	.addr_format = FI_SOCKADDR,
+	.tx_attr = &rxm_tx_attr,
+	.rx_attr = &rxm_rx_attr,
+	.ep_attr = &rxm_ep_attr,
+	.domain_attr = &rxm_domain_attr,
+	.fabric_attr = &rxm_verbs_fabric_attr,
+	.next = &rxm_tcp_info,
 };
 
 struct util_prov rxm_util_prov = {
 	.prov = &rxm_prov,
+	.info = &rxm_verbs_info,
 	.flags = 0,
 };
diff --git a/deps/libfabric/prov/rxm/src/rxm_av.c b/deps/libfabric/prov/rxm/src/rxm_av.c
index 2667c9f3cc0c6b7cdceb273d85e881a757ffbb0a..94957fc67b97f7be113200154cc7cdb0ad6a7600 100644
--- a/deps/libfabric/prov/rxm/src/rxm_av.c
+++ b/deps/libfabric/prov/rxm/src/rxm_av.c
@@ -61,6 +61,10 @@ static int rxm_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
 	return ofi_ip_av_remove(av_fid, fi_addr, count, flags);
 }
 
+/* TODO: Determine if it's cleaner to insert an address into the cmap only
+ * when we need to send to that address, rather than inserting the address
+ * into the cmap when adding it to the AV.
+ */
 static int
 rxm_av_insert_cmap(struct fid_av *av_fid, const void *addr, size_t count,
 		   fi_addr_t *fi_addr, uint64_t flags)
@@ -145,12 +149,15 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node,
 	assert(ret == count);
 
 	ret = ofi_ip_av_insertv(av, addr, addrlen, count, fi_addr, context);
-	if (ret < 0)
-		goto out;
-
-	if (!av->eq && !ret)
-		goto out;
+	if (!av->eq && ret < count) {
+		count = ret;
+	}
 
+	/* If the AV is bound to an EQ, we can't determine which entries were
+	 * added successfully to the AV until we process the insertion events
+	 * later when reading the EQ.  Add all addresses to the cmap
+	 * optimistically.
+	 */
 	retv = rxm_av_insert_cmap(av_fid, addr, count, fi_addr, flags);
 	if (retv) {
 		ret = rxm_av_remove(av_fid, fi_addr, count, flags);
@@ -159,10 +166,9 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node,
 				"from AV during error handling\n");
 		ret = retv;
 	}
-out:
+
 	free(addr);
 	return ret;
-
 }
 
 int rxm_av_insertsvc(struct fid_av *av, const char *node, const char *service,
diff --git a/deps/libfabric/prov/rxm/src/rxm_conn.c b/deps/libfabric/prov/rxm/src/rxm_conn.c
index 79ae11cb75424b3d76ac0eaba82de84c91a18572..dff5d7dcc0e9e5d620b10d0fe8f007e1ea567ed7 100644
--- a/deps/libfabric/prov/rxm/src/rxm_conn.c
+++ b/deps/libfabric/prov/rxm/src/rxm_conn.c
@@ -175,6 +175,37 @@ static int rxm_cmap_del_handle(struct rxm_cmap_handle *handle)
 	return 0;
 }
 
+ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr,
+		     struct rxm_conn **rxm_conn)
+{
+	struct rxm_cmap_handle *handle;
+	ssize_t ret;
+
+	assert(rxm_ep->util_ep.tx_cq);
+	handle = rxm_cmap_acquire_handle(rxm_ep->cmap, addr);
+	if (!handle) {
+		ret = rxm_cmap_alloc_handle(rxm_ep->cmap, addr,
+					    RXM_CMAP_IDLE, &handle);
+		if (ret)
+			return ret;
+	}
+
+	*rxm_conn = container_of(handle, struct rxm_conn, handle);
+
+	if (handle->state != RXM_CMAP_CONNECTED) {
+		ret = rxm_cmap_connect(rxm_ep, addr, handle);
+		if (ret)
+			return ret;
+	}
+
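+	/* Flush any deferred sends queued on this connection first so that
+	 * message ordering is preserved. */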
+	if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue)) {
+		rxm_ep_do_progress(&rxm_ep->util_ep);
+		if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue))
+			return -FI_EAGAIN;
+	}
+	return 0;
+}
+
 static inline int
 rxm_cmap_check_and_realloc_handles_table(struct rxm_cmap *cmap,
 					 fi_addr_t fi_addr)
@@ -268,6 +299,27 @@ static int rxm_conn_res_alloc(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn)
 static void rxm_conn_close(struct rxm_cmap_handle *handle)
 {
 	struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle);
+	struct rxm_conn *rxm_conn_tmp;
+	struct rxm_deferred_tx_entry *def_tx_entry;
+	struct dlist_entry *conn_entry_tmp;
+
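+	/* Cancel deferred transmits queued on this connection; they can
+	 * no longer complete once the msg endpoint is closed. */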
+	dlist_foreach_container_safe(&handle->cmap->ep->deferred_tx_conn_queue,
+				     struct rxm_conn, rxm_conn_tmp,
+				     deferred_conn_entry, conn_entry_tmp)
+	{
+		if (rxm_conn_tmp->handle.key != handle->key)
+			continue;
+
+		while (!dlist_empty(&rxm_conn_tmp->deferred_tx_queue)) {
+			def_tx_entry =
+				container_of(rxm_conn_tmp->deferred_tx_queue.next,
+					     struct rxm_deferred_tx_entry, entry);
+			FI_DBG(&rxm_prov, FI_LOG_EP_CTRL,
+			       "cancelled deferred message\n");
+			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
+			free(def_tx_entry);
+		}
+	}
 
 	FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing msg ep\n");
 	if (!rxm_conn->msg_ep)
@@ -288,23 +340,26 @@ static void rxm_conn_free(struct rxm_cmap_handle *handle)
 	free(rxm_conn);
 }
 
-static int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr,
-				 enum rxm_cmap_state state,
-				 struct rxm_cmap_handle **handle)
+int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr,
+			  enum rxm_cmap_state state,
+			  struct rxm_cmap_handle **handle)
 {
 	int ret;
 
 	*handle = rxm_conn_alloc(cmap);
-	if (OFI_UNLIKELY(!*handle))
+	if (!*handle)
 		return -FI_ENOMEM;
+
 	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL,
 	       "Allocated handle: %p for fi_addr: %" PRIu64 "\n",
 	       *handle, fi_addr);
+
 	ret = rxm_cmap_check_and_realloc_handles_table(cmap, fi_addr);
-	if (OFI_UNLIKELY(ret)) {
+	if (ret) {
 		rxm_conn_free(*handle);
 		return ret;
 	}
+
 	rxm_cmap_init_handle(*handle, cmap, state, fi_addr, NULL);
 	cmap->handles_av[fi_addr] = *handle;
 	return 0;
@@ -319,14 +374,17 @@ static int rxm_cmap_alloc_handle_peer(struct rxm_cmap *cmap, void *addr,
 	peer = calloc(1, sizeof(*peer) + cmap->av->addrlen);
 	if (!peer)
 		return -FI_ENOMEM;
+
 	*handle = rxm_conn_alloc(cmap);
 	if (!*handle) {
 		free(peer);
 		return -FI_ENOMEM;
 	}
-	ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV, "Allocated handle for addr",
-			addr);
+
+	ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV,
+			"Allocated handle for addr", addr);
 	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "handle: %p\n", *handle);
+
 	rxm_cmap_init_handle(*handle, cmap, state, FI_ADDR_NOTAVAIL, peer);
 	FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Adding handle to peer list\n");
 	peer->handle = *handle;
@@ -345,6 +403,7 @@ rxm_cmap_get_handle_peer(struct rxm_cmap *cmap, const void *addr)
 				       addr);
 	if (!entry)
 		return NULL;
+
 	ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV,
 			"handle found in peer list for addr", addr);
 	peer = container_of(entry, struct rxm_cmap_peer, entry);
@@ -628,6 +687,9 @@ int rxm_cmap_connect(struct rxm_ep *rxm_ep, fi_addr_t fi_addr,
 		ret = rxm_conn_connect(rxm_ep, handle,
 				       ofi_av_get_addr(rxm_ep->cmap->av, fi_addr));
 		if (ret) {
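+			/* A refused connection may be transient (e.g. the
+			 * peer is not listening yet); report -FI_EAGAIN so
+			 * the caller can retry. */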
+			if (ret == -FI_ECONNREFUSED)
+				return -FI_EAGAIN;
+
 			rxm_cmap_del_handle(handle);
 		} else {
 			RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNREQ_SENT);
@@ -688,11 +750,10 @@ void rxm_cmap_free(struct rxm_cmap *cmap)
 		if (cmap->handles_av[i]) {
 			rxm_cmap_clear_key(cmap->handles_av[i]);
 			rxm_conn_free(cmap->handles_av[i]);
-			cmap->handles_av[i] = 0;
 		}
 	}
 
-	while(!dlist_empty(&cmap->peer_list)) {
+	while (!dlist_empty(&cmap->peer_list)) {
 		entry = cmap->peer_list.next;
 		peer = container_of(entry, struct rxm_cmap_peer, entry);
 		dlist_remove(&peer->entry);
@@ -796,6 +857,7 @@ static int rxm_msg_ep_open(struct rxm_ep *rxm_ep, struct fi_info *msg_info,
 
 	rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain,
 			util_domain);
+
 	ret = fi_endpoint(rxm_domain->msg_domain, msg_info, &msg_ep, context);
 	if (ret) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
@@ -834,13 +896,19 @@ static int rxm_msg_ep_open(struct rxm_ep *rxm_ep, struct fi_info *msg_info,
 		goto err;
 	}
 
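+	/* Enable core-provider flow control when available; the credit
+	 * threshold is only set if enabling succeeds. */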
+	ret = rxm_domain->flow_ctrl_ops->enable(msg_ep);
+	if (!ret) {
+		rxm_domain->flow_ctrl_ops->set_threshold(
+			msg_ep, rxm_ep->msg_info->rx_attr->size / 2);
+	}
+
+	rxm_conn->msg_ep = msg_ep;
+
 	if (!rxm_ep->srx_ctx) {
 		ret = rxm_msg_ep_prepost_recv(rxm_ep, msg_ep);
 		if (ret)
 			goto err;
 	}
-
-	rxm_conn->msg_ep = msg_ep;
 	return 0;
 err:
 	fi_close(&msg_ep->fid);
@@ -877,7 +945,7 @@ static int rxm_conn_reprocess_directed_recvs(struct rxm_recv_queue *recv_queue)
 		rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry,
 						  entry);
 
-		ret = rxm_cq_handle_rx_buf(rx_buf);
+		ret = rxm_handle_rx_buf(rx_buf);
 		if (ret) {
 			err_entry.op_context = rx_buf;
 			err_entry.flags = rx_buf->recv_entry->comp_flags;
@@ -919,9 +987,10 @@ rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle)
 
 static struct rxm_cmap_handle *rxm_conn_alloc(struct rxm_cmap *cmap)
 {
-	struct rxm_conn *rxm_conn = calloc(1, sizeof(*rxm_conn));
+	struct rxm_conn *rxm_conn;
 
-	if (OFI_UNLIKELY(!rxm_conn))
+	rxm_conn = calloc(1, sizeof(*rxm_conn));
+	if (!rxm_conn)
 		return NULL;
 
 	if (rxm_conn_res_alloc(cmap->ep, rxm_conn)) {
@@ -1072,14 +1141,14 @@ static void rxm_flush_msg_cq(struct rxm_ep *rxm_ep)
 	do {
 		ret = fi_cq_read(rxm_ep->msg_cq, &comp, 1);
 		if (ret > 0) {
-			ret = rxm_cq_handle_comp(rxm_ep, &comp);
+			ret = rxm_handle_comp(rxm_ep, &comp);
 			if (OFI_UNLIKELY(ret)) {
 				rxm_cq_write_error_all(rxm_ep, ret);
 			} else {
 				ret = 1;
 			}
 		} else if (ret == -FI_EAVAIL) {
-			rxm_cq_read_write_error(rxm_ep);
+			rxm_handle_comp_error(rxm_ep);
 			ret = 1;
 		} else if (ret < 0 && ret != -FI_EAGAIN) {
 			rxm_cq_write_error_all(rxm_ep, ret);
@@ -1116,7 +1185,7 @@ static int rxm_conn_handle_notify(struct fi_eq_entry *eq_entry)
 		free(handle->peer);
 		handle->peer = NULL;
 	} else {
-		cmap->handles_av[handle->fi_addr] = 0;
+		cmap->handles_av[handle->fi_addr] = NULL;
 	}
 	rxm_conn_free(handle);
 	return 0;
diff --git a/deps/libfabric/prov/rxm/src/rxm_cq.c b/deps/libfabric/prov/rxm/src/rxm_cq.c
index f12c34afc49dceb267616908ffedfedaf6ba8468..32a31e4de07f137d6f17af3bc3599ea5c7077486 100644
--- a/deps/libfabric/prov/rxm/src/rxm_cq.c
+++ b/deps/libfabric/prov/rxm/src/rxm_cq.c
@@ -1,6 +1,7 @@
 /*
- * Copyright (c) 2013-2016 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
  * Copyright (c) 2018 Cray Inc. All rights reserved.
+ * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved.
  * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -46,7 +47,8 @@
 size_t rxm_cm_progress_interval;
 size_t rxm_cq_eq_fairness;
 
-static const char *rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno,
+static const char *
+rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno,
 		const void *err_data, char *buf, size_t len)
 {
 	struct util_cq *cq;
@@ -60,10 +62,19 @@ static const char *rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno,
 	return fi_cq_strerror(rxm_ep->msg_cq, prov_errno, err_data, buf, len);
 }
 
-static inline uint64_t
-rxm_cq_get_rx_comp_and_op_flags(struct rxm_rx_buf *rx_buf)
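+/* Allocate a replacement rx buffer and queue it for reposting so that
+ * the receive ring stays full while the current buffer is held for
+ * later processing. */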
+static int rxm_repost_new_rx(struct rxm_rx_buf *rx_buf)
 {
-	return (rx_buf->pkt.hdr.flags | ofi_rx_flags[rx_buf->pkt.hdr.op]);
+	struct rxm_rx_buf *new_rx_buf;
+
+	if (rx_buf->repost) {
+		rx_buf->repost = 0;
+		new_rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->msg_ep, 1);
+		if (!new_rx_buf)
+			return -FI_ENOMEM;
+
+		dlist_insert_tail(&new_rx_buf->repost_entry,
+				  &new_rx_buf->ep->repost_ready_list);
+	}
+	return FI_SUCCESS;
 }
 
 static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
@@ -71,23 +82,15 @@ static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf)
 	uint64_t flags;
 	char *data;
 
-	if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg &&
+	if ((rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) &&
 	    rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) {
 		dlist_insert_tail(&rx_buf->unexp_msg.entry,
 				  &rx_buf->conn->sar_deferred_rx_msg_list);
-		rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->msg_ep, 1);
-		if (OFI_UNLIKELY(!rx_buf)) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-				"ran out of buffers from RX buffer pool\n");
-			return -FI_ENOMEM;
-		}
-		dlist_insert_tail(&rx_buf->repost_entry,
-				  &rx_buf->ep->repost_ready_list);
-
-		return 0;
+		/* Repost a new buffer immediately; SAR processing can take
+		 * a while to complete. */
+		return rxm_repost_new_rx(rx_buf);
 	}
 
-	flags = rxm_cq_get_rx_comp_and_op_flags(rx_buf);
+	flags = (rx_buf->pkt.hdr.flags | ofi_rx_flags[rx_buf->pkt.hdr.op]);
 
 	if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_eager)
 		flags |= FI_MORE;
@@ -123,12 +126,9 @@ static int rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
 				       rx_buf->recv_entry->rxm_iov.iov[0].iov_base,
 				       rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag,
 				       rx_buf->pkt.hdr.size - done_len);
-	if (OFI_UNLIKELY(ret)) {
-		FI_WARN(&rxm_prov, FI_LOG_CQ,
-			"Unable to write recv error CQ\n");
-		return ret;
-	}
-	return 0;
+	if (ret)
+		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write recv error CQ\n");
+	return ret;
 }
 
 static int rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len)
@@ -144,7 +144,8 @@ static int rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len)
 
 	if (rx_buf->recv_entry->flags & FI_COMPLETION ||
 	    rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) {
-		ret = rxm_cq_write_recv_comp(rx_buf, rx_buf->recv_entry->context,
+		ret = rxm_cq_write_recv_comp(rx_buf,
+				rx_buf->recv_entry->context,
 				rx_buf->recv_entry->comp_flags |
 				rx_buf->pkt.hdr.flags,
 				rx_buf->pkt.hdr.size,
@@ -160,7 +161,8 @@ static int rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len)
 		recv_entry->total_len -= recv_size;
 
 		if (recv_entry->total_len < rx_buf->ep->min_multi_recv_size) {
-			ret = ofi_cq_write(rx_buf->ep->util_ep.rx_cq, recv_entry->context,
+			ret = ofi_cq_write(rx_buf->ep->util_ep.rx_cq,
+					   recv_entry->context,
 					   FI_MULTI_RECV, 0, NULL, 0, 0);
 			goto release;
 		}
@@ -183,27 +185,31 @@ free_buf:
 	return ret;
 }
 
-static inline int
-rxm_cq_tx_comp_write(struct rxm_ep *rxm_ep, uint64_t comp_flags,
+static int
+rxm_cq_write_tx_comp(struct rxm_ep *rxm_ep, uint64_t comp_flags,
 		     void *app_context,  uint64_t flags)
 {
+	int ret;
+
 	if (flags & FI_COMPLETION) {
-		int ret = ofi_cq_write(rxm_ep->util_ep.tx_cq, app_context,
-				       comp_flags, 0, NULL, 0, 0);
-		if (OFI_UNLIKELY(ret)) {
+		ret = ofi_cq_write(rxm_ep->util_ep.tx_cq, app_context,
+				   comp_flags, 0, NULL, 0, 0);
+		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_CQ,
 				"Unable to report completion\n");
-			return ret;
+		} else {
+			rxm_cq_log_comp(comp_flags);
 		}
-		rxm_cq_log_comp(comp_flags);
+	} else {
+		ret = 0;
 	}
-	return 0;
+	return ret;
 }
 
-static inline int rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_buf,
-				 uint64_t comp_flags)
+static int rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_buf,
+			  uint64_t comp_flags)
 {
-	int ret = rxm_cq_tx_comp_write(rxm_ep, comp_flags,
+	int ret = rxm_cq_write_tx_comp(rxm_ep, comp_flags,
 				       rma_buf->app_context, rma_buf->flags);
 
 	assert(((comp_flags & FI_WRITE) && !(comp_flags & FI_READ)) ||
@@ -225,7 +231,7 @@ static inline int rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_
 
 int rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_buf)
 {
-	int ret = rxm_cq_tx_comp_write(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
+	int ret = rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
 				       tx_buf->app_context, tx_buf->flags);
 
 	assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
@@ -248,7 +254,7 @@ static int rxm_finish_sar_segment_send(struct rxm_ep *rxm_ep,
 		break;
 	case RXM_SAR_SEG_LAST:
 		if (!err) {
-			ret = rxm_cq_tx_comp_write(rxm_ep,
+			ret = rxm_cq_write_tx_comp(rxm_ep,
 					ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
 					tx_buf->app_context, tx_buf->flags);
 
@@ -266,7 +272,7 @@ static int rxm_finish_sar_segment_send(struct rxm_ep *rxm_ep,
 	return ret;
 }
 
-static inline int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf)
+static int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf)
 {
 	RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_FINISH);
 
@@ -276,12 +282,14 @@ static inline int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf)
 	}
 
 	if (!rx_buf->ep->rdm_mr_local)
-		rxm_msg_mr_closev(rx_buf->mr, rx_buf->recv_entry->rxm_iov.count);
+		rxm_msg_mr_closev(rx_buf->mr,
+				  rx_buf->recv_entry->rxm_iov.count);
 
 	return rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len);
 }
 
-static int rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, struct rxm_tx_rndv_buf *tx_buf)
+static int rxm_rndv_tx_finish(struct rxm_ep *rxm_ep,
+			      struct rxm_tx_rndv_buf *tx_buf)
 {
 	int ret;
 
@@ -290,7 +298,7 @@ static int rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, struct rxm_tx_rndv_buf *tx_
 	if (!rxm_ep->rdm_mr_local)
 		rxm_msg_mr_closev(tx_buf->mr, tx_buf->count);
 
-	ret = rxm_cq_tx_comp_write(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
+	ret = rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
 				   tx_buf->app_context, tx_buf->flags);
 
 	assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND);
@@ -304,6 +312,7 @@ static int rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, struct rxm_tx_rndv_buf *tx_
 static int rxm_rndv_handle_ack(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf)
 {
 	struct rxm_tx_rndv_buf *tx_buf;
+	int ret;
 
 	tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_RNDV].pool,
 				      rx_buf->pkt.ctrl_hdr.msg_id);
@@ -316,34 +325,39 @@ static int rxm_rndv_handle_ack(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf)
 	rxm_rx_buf_free(rx_buf);
 
 	if (tx_buf->hdr.state == RXM_RNDV_ACK_WAIT) {
-		return rxm_rndv_tx_finish(rxm_ep, tx_buf);
+		ret = rxm_rndv_tx_finish(rxm_ep, tx_buf);
 	} else {
 		assert(tx_buf->hdr.state == RXM_RNDV_TX);
 		RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_ACK_RECVD);
-		return 0;
+		ret = 0;
 	}
+	return ret;
 }
 
 static int rxm_rx_buf_match_msg_id(struct dlist_entry *item, const void *arg)
 {
-	uint64_t msg_id = *((uint64_t *)arg);
-	struct rxm_rx_buf *rx_buf =
-		container_of(item, struct rxm_rx_buf, unexp_msg.entry);
+	uint64_t msg_id = *((uint64_t *) arg);
+	struct rxm_rx_buf *rx_buf;
+
+	rx_buf = container_of(item, struct rxm_rx_buf, unexp_msg.entry);
 	return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id);
 }
 
-static inline
-ssize_t rxm_cq_copy_seg_data(struct rxm_rx_buf *rx_buf, int *done)
+static ssize_t rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 {
-	uint64_t done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
-					    rx_buf->recv_entry->rxm_iov.count,
-					    rx_buf->recv_entry->sar.total_recv_len,
-					    rx_buf->pkt.data,
-					    rx_buf->pkt.ctrl_hdr.seg_size);
+	uint64_t done_len;
+	ssize_t ret;
+
+	done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
+				   rx_buf->recv_entry->rxm_iov.count,
+				   rx_buf->recv_entry->sar.total_recv_len,
+				   rx_buf->pkt.data,
+				   rx_buf->pkt.ctrl_hdr.seg_size);
 	rx_buf->recv_entry->sar.total_recv_len += done_len;
 
 	if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST) ||
 	    (done_len != rx_buf->pkt.ctrl_hdr.seg_size)) {
 		dlist_remove(&rx_buf->recv_entry->sar.entry);
 
 		/* Mark rxm_recv_entry::msg_id as unknown for further re-use */
@@ -353,7 +367,7 @@ ssize_t rxm_cq_copy_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 		rx_buf->recv_entry->sar.total_recv_len = 0;
 
 		*done = 1;
-		return rxm_finish_recv(rx_buf, done_len);
+		ret = rxm_finish_recv(rx_buf, done_len);
 	} else {
 		if (rx_buf->recv_entry->sar.msg_id == RXM_SAR_RX_INIT) {
 			if (!rx_buf->conn) {
@@ -373,52 +387,54 @@ ssize_t rxm_cq_copy_seg_data(struct rxm_rx_buf *rx_buf, int *done)
 		rxm_rx_buf_free(rx_buf);
 
 		*done = 0;
-		return FI_SUCCESS;
+		ret = FI_SUCCESS;
 	}
+	return ret;
 }
 
-ssize_t rxm_cq_handle_seg_data(struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_handle_seg_data(struct rxm_rx_buf *rx_buf)
 {
+	struct rxm_recv_entry *recv_entry;
+	struct rxm_conn *conn;
+	uint64_t msg_id;
+	struct dlist_entry *entry;
+	ssize_t ret;
 	int done;
 
-	if (rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) {
-		struct rxm_recv_entry *recv_entry = rx_buf->recv_entry;
-		struct rxm_conn *conn = rx_buf->conn;
-		uint64_t msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
-		struct dlist_entry *entry;
-		ssize_t ret;
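+	/* Consume the current segment; for buffered receives, also drain
+	 * any segments of this message that were deferred earlier. */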
+	ret = rxm_process_seg_data(rx_buf, &done);
+	if (done || !(rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV))
+		return ret;
 
-		ret = rxm_cq_copy_seg_data(rx_buf, &done);
-		if (done)
-			return ret;
+	recv_entry = rx_buf->recv_entry;
+	conn = rx_buf->conn;
+	msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
 
-		dlist_foreach_container_safe(&conn->sar_deferred_rx_msg_list,
-					     struct rxm_rx_buf, rx_buf,
-					     unexp_msg.entry, entry) {
-			if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id))
-				continue;
-			dlist_remove(&rx_buf->unexp_msg.entry);
-			rx_buf->recv_entry = recv_entry;
-			ret = rxm_cq_copy_seg_data(rx_buf, &done);
-			if (done)
-				break;
-		}
-		return ret;
-	} else {
-		return rxm_cq_copy_seg_data(rx_buf, &done);
+	dlist_foreach_container_safe(&conn->sar_deferred_rx_msg_list,
+				     struct rxm_rx_buf, rx_buf,
+				     unexp_msg.entry, entry) {
+		if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id))
+			continue;
+
+		dlist_remove(&rx_buf->unexp_msg.entry);
+		rx_buf->recv_entry = recv_entry;
+		ret = rxm_process_seg_data(rx_buf, &done);
+		if (done)
+			break;
 	}
+	return ret;
 }
 
-static inline ssize_t
-rxm_cq_rndv_read_prepare_deferred(struct rxm_deferred_tx_entry **def_tx_entry, size_t index,
-				 struct iovec *iov, void *desc[RXM_IOV_LIMIT],
-				 size_t count, struct rxm_rx_buf *rx_buf)
+static ssize_t
+rxm_prepare_deferred_rndv_read(struct rxm_deferred_tx_entry **def_tx_entry,
+			       size_t index, struct iovec *iov,
+			       void *desc[RXM_IOV_LIMIT], size_t count,
+			       struct rxm_rx_buf *rx_buf)
 {
 	uint8_t i;
 
 	*def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, rx_buf->conn,
 						       RXM_DEFERRED_TX_RNDV_READ);
-	if (OFI_UNLIKELY(!*def_tx_entry))
+	if (!*def_tx_entry)
 		return -FI_ENOMEM;
 
 	(*def_tx_entry)->rndv_read.rx_buf = rx_buf;
@@ -426,6 +442,7 @@ rxm_cq_rndv_read_prepare_deferred(struct rxm_deferred_tx_entry **def_tx_entry, s
 			rx_buf->rndv_hdr->iov[index].addr;
 	(*def_tx_entry)->rndv_read.rma_iov.key =
 			rx_buf->rndv_hdr->iov[index].key;
+
 	for (i = 0; i < count; i++) {
 		(*def_tx_entry)->rndv_read.rxm_iov.iov[i] = iov[i];
 		(*def_tx_entry)->rndv_read.rxm_iov.desc[i] = desc[i];
@@ -435,29 +452,25 @@ rxm_cq_rndv_read_prepare_deferred(struct rxm_deferred_tx_entry **def_tx_entry, s
 	return 0;
 }
 
-ssize_t rxm_cq_handle_rndv(struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf)
 {
 	size_t i, index = 0, offset = 0, count, total_recv_len;
 	struct iovec iov[RXM_IOV_LIMIT];
 	void *desc[RXM_IOV_LIMIT];
-	struct rxm_rx_buf *new_rx_buf;
 	int ret = 0;
 
-	rx_buf->repost = 0;
-
 	/* En-queue new rx buf to be posted ASAP so that we don't block any
-	 * incoming messages. RNDV processing can take a while. */
-	new_rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->msg_ep, 1);
-	if (OFI_UNLIKELY(!new_rx_buf))
-		return -FI_ENOMEM;
-	dlist_insert_tail(&new_rx_buf->repost_entry,
-			  &new_rx_buf->ep->repost_ready_list);
+	 * incoming messages. RNDV processing can take a while. */
+	ret = rxm_repost_new_rx(rx_buf);
+	if (ret)
+		return ret;
 
 	if (!rx_buf->conn) {
 		assert(rx_buf->ep->srx_ctx);
 		rx_buf->conn = rxm_key2conn(rx_buf->ep,
 					    rx_buf->pkt.ctrl_hdr.conn_id);
-		if (OFI_UNLIKELY(!rx_buf->conn))
+		if (!rx_buf->conn)
 			return -FI_EOTHER;
 	}
 	assert(rx_buf->conn);
@@ -466,7 +479,7 @@ ssize_t rxm_cq_handle_rndv(struct rxm_rx_buf *rx_buf)
 	       "Got incoming recv with msg_id: 0x%" PRIx64 "\n",
 	       rx_buf->pkt.ctrl_hdr.msg_id);
 
-	rx_buf->rndv_hdr = (struct rxm_rndv_hdr *)rx_buf->pkt.data;
+	rx_buf->rndv_hdr = (struct rxm_rndv_hdr *) rx_buf->pkt.data;
 	rx_buf->rndv_rma_index = 0;
 
 	if (!rx_buf->ep->rdm_mr_local) {
@@ -476,17 +489,19 @@ ssize_t rxm_cq_handle_rndv(struct rxm_rx_buf *rx_buf)
 				      rx_buf->recv_entry->rxm_iov.iov,
 				      rx_buf->recv_entry->rxm_iov.count,
 				      total_recv_len, FI_READ, rx_buf->mr);
-		if (OFI_UNLIKELY(ret))
+		if (ret)
 			return ret;
 
 		for (i = 0; (i < rx_buf->recv_entry->rxm_iov.count &&
-			     rx_buf->mr[i]); i++)
+			     rx_buf->mr[i]); i++) {
 			rx_buf->recv_entry->rxm_iov.desc[i] =
 						fi_mr_desc(rx_buf->mr[i]);
+		}
 	} else {
-		for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++)
+		for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) {
 			rx_buf->recv_entry->rxm_iov.desc[i] =
 				fi_mr_desc(rx_buf->recv_entry->rxm_iov.desc[i]);
+		}
 		total_recv_len = MIN(rx_buf->recv_entry->total_len,
 				     rx_buf->pkt.hdr.size);
 	}
@@ -507,18 +522,18 @@ ssize_t rxm_cq_handle_rndv(struct rxm_rx_buf *rx_buf)
 					&index, &offset, copy_len);
 		if (ret) {
 			assert(ret == -FI_ETOOSMALL);
-			return rxm_cq_write_error_trunc(
-				rx_buf, rx_buf->recv_entry->total_len);
+			return rxm_cq_write_error_trunc(rx_buf,
+						rx_buf->recv_entry->total_len);
 		}
 		total_recv_len -= copy_len;
 		ret = fi_readv(rx_buf->conn->msg_ep, iov, desc, count, 0,
 			       rx_buf->rndv_hdr->iov[i].addr,
 			       rx_buf->rndv_hdr->iov[i].key, rx_buf);
-		if (OFI_UNLIKELY(ret)) {
-			if (OFI_LIKELY(ret == -FI_EAGAIN)) {
+		if (ret) {
+			if (ret == -FI_EAGAIN) {
 				struct rxm_deferred_tx_entry *def_tx_entry;
 
-				ret = rxm_cq_rndv_read_prepare_deferred(
+				ret = rxm_prepare_deferred_rndv_read(
 						&def_tx_entry, i, iov, desc,
 						count, rx_buf);
 				if (ret)
@@ -537,41 +552,46 @@ readv_err:
 	return ret;
 }
 
-ssize_t rxm_cq_handle_eager(struct rxm_rx_buf *rx_buf)
+ssize_t rxm_handle_eager(struct rxm_rx_buf *rx_buf)
 {
-	uint64_t done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
-					    rx_buf->recv_entry->rxm_iov.count,
-					    0, rx_buf->pkt.data,
-					    rx_buf->pkt.hdr.size);
+	uint64_t done_len;
+
+	done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
+				   rx_buf->recv_entry->rxm_iov.count,
+				   0, rx_buf->pkt.data, rx_buf->pkt.hdr.size);
 	return rxm_finish_recv(rx_buf, done_len);
 }
 
-ssize_t rxm_cq_handle_coll_eager(struct rxm_rx_buf *rx_buf)
+ssize_t rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf)
 {
-	uint64_t done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
-					    rx_buf->recv_entry->rxm_iov.count,
-					    0, rx_buf->pkt.data,
-					    rx_buf->pkt.hdr.size);
+	uint64_t done_len;
+	ssize_t ret;
+
+	done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov,
+				   rx_buf->recv_entry->rxm_iov.count,
+				   0, rx_buf->pkt.data, rx_buf->pkt.hdr.size);
 	if (rx_buf->pkt.hdr.tag & OFI_COLL_TAG_FLAG) {
 		ofi_coll_handle_xfer_comp(rx_buf->pkt.hdr.tag,
 				rx_buf->recv_entry->context);
 		rxm_rx_buf_free(rx_buf);
 		rxm_recv_entry_release(rx_buf->recv_entry->recv_queue,
 				rx_buf->recv_entry);
-		return FI_SUCCESS;
+		ret = FI_SUCCESS;
+	} else {
+		ret = rxm_finish_recv(rx_buf, done_len);
 	}
-	return rxm_finish_recv(rx_buf, done_len);
+	return ret;
 }
 
-ssize_t rxm_cq_handle_rx_buf(struct rxm_rx_buf *rx_buf)
+ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf)
 {
 	switch (rx_buf->pkt.ctrl_hdr.type) {
 	case rxm_ctrl_eager:
-		return rx_buf->ep->txrx_ops->handle_eager_rx(rx_buf);
+		return rx_buf->ep->eager_ops->handle_rx(rx_buf);
 	case rxm_ctrl_rndv:
-		return rx_buf->ep->txrx_ops->handle_rndv_rx(rx_buf);
+		return rxm_handle_rndv(rx_buf);
 	case rxm_ctrl_seg:
-		return rx_buf->ep->txrx_ops->handle_seg_data_rx(rx_buf);
+		return rxm_handle_seg_data(rx_buf);
 	default:
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown message type\n");
 		assert(0);
@@ -579,50 +599,35 @@ ssize_t rxm_cq_handle_rx_buf(struct rxm_rx_buf *rx_buf)
 	}
 }
 
-static inline ssize_t
-rxm_cq_match_rx_buf(struct rxm_rx_buf *rx_buf,
-		    struct rxm_recv_queue *recv_queue,
+static ssize_t
+rxm_match_rx_buf(struct rxm_rx_buf *rx_buf,
+		 struct rxm_recv_queue *recv_queue,
 		    struct rxm_recv_match_attr *match_attr)
 {
 	struct dlist_entry *entry;
-	struct rxm_ep *rxm_ep;
-	struct fid_ep *msg_ep;
 
 	entry = dlist_remove_first_match(&recv_queue->recv_list,
 					 recv_queue->match_recv, match_attr);
-	if (!entry) {
-		RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for "
-				 "incoming msg", match_attr->addr,
-				 match_attr->tag);
-		FI_DBG(&rxm_prov, FI_LOG_CQ, "Enqueueing msg to unexpected msg"
-		       "queue\n");
-		rx_buf->unexp_msg.addr = match_attr->addr;
-		rx_buf->unexp_msg.tag = match_attr->tag;
-		rx_buf->repost = 0;
-
-		dlist_insert_tail(&rx_buf->unexp_msg.entry,
-				  &recv_queue->unexp_msg_list);
+	if (entry) {
+		rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry);
+		return rxm_handle_rx_buf(rx_buf);
+	}
 
-		msg_ep = rx_buf->msg_ep;
-		rxm_ep = rx_buf->ep;
+	RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for incoming msg",
+			 match_attr->addr, match_attr->tag);
+	FI_DBG(&rxm_prov, FI_LOG_CQ, "Enqueueing msg to unexpected msg queue\n");
+	rx_buf->unexp_msg.addr = match_attr->addr;
+	rx_buf->unexp_msg.tag = match_attr->tag;
 
-		rx_buf = rxm_rx_buf_alloc(rxm_ep, msg_ep, 1);
-		if (OFI_UNLIKELY(!rx_buf)) {
-			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-				"ran out of buffers from RX buffer pool\n");
-			return -FI_ENOMEM;
-		}
+	dlist_insert_tail(&rx_buf->unexp_msg.entry,
+			  &recv_queue->unexp_msg_list);
 
-		dlist_insert_tail(&rx_buf->repost_entry,
-				  &rxm_ep->repost_ready_list);
-		return 0;
-	}
-
-	rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry);
-	return rxm_cq_handle_rx_buf(rx_buf);
+	/* Repost a new buffer now, since we don't know when the unexpected
+	 * buffer will be consumed. */
+	return rxm_repost_new_rx(rx_buf);
 }
 
-static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
 {
 	struct rxm_recv_match_attr match_attr = {
 		.addr = FI_ADDR_UNSPEC,
@@ -630,9 +635,9 @@ static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
 
 	if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) {
 		if (rx_buf->ep->srx_ctx)
-			rx_buf->conn =
-				rxm_key2conn(rx_buf->ep, rx_buf->pkt.ctrl_hdr.conn_id);
-		if (OFI_UNLIKELY(!rx_buf->conn))
+			rx_buf->conn = rxm_key2conn(rx_buf->ep,
+						    rx_buf->pkt.ctrl_hdr.conn_id);
+		if (!rx_buf->conn)
 			return -FI_EOTHER;
 		match_attr.addr = rx_buf->conn->handle.fi_addr;
 	}
@@ -643,13 +648,13 @@ static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
 	switch(rx_buf->pkt.hdr.op) {
 	case ofi_op_msg:
 		FI_DBG(&rxm_prov, FI_LOG_CQ, "Got MSG op\n");
-		return rxm_cq_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue,
-					   &match_attr);
+		return rxm_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue,
+					&match_attr);
 	case ofi_op_tagged:
 		FI_DBG(&rxm_prov, FI_LOG_CQ, "Got TAGGED op\n");
 		match_attr.tag = rx_buf->pkt.hdr.tag;
-		return rxm_cq_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue,
-					   &match_attr);
+		return rxm_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue,
+					&match_attr);
 	default:
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown op!\n");
 		assert(0);
@@ -659,32 +664,34 @@ static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf)
 
 static int rxm_sar_match_msg_id(struct dlist_entry *item, const void *arg)
 {
-	uint64_t msg_id = *((uint64_t *)arg);
-	struct rxm_recv_entry *recv_entry =
-		container_of(item, struct rxm_recv_entry, sar.entry);
+	uint64_t msg_id = *((uint64_t *) arg);
+	struct rxm_recv_entry *recv_entry;
+
+	recv_entry = container_of(item, struct rxm_recv_entry, sar.entry);
 	return (msg_id == recv_entry->sar.msg_id);
 }
 
-static inline
-ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf)
 {
 	struct dlist_entry *sar_entry;
 
 	rx_buf->conn = rxm_key2conn(rx_buf->ep,
 				    rx_buf->pkt.ctrl_hdr.conn_id);
-	if (OFI_UNLIKELY(!rx_buf->conn))
+	if (!rx_buf->conn)
 		return -FI_EOTHER;
+
 	FI_DBG(&rxm_prov, FI_LOG_CQ,
-	       "Got incoming recv with msg_id: 0x%" PRIx64 "for conn - %p\n",
+	       "Got incoming recv with msg_id: 0x%" PRIx64 " for conn - %p\n",
 	       rx_buf->pkt.ctrl_hdr.msg_id, rx_buf->conn);
 	sar_entry = dlist_find_first_match(&rx_buf->conn->sar_rx_msg_list,
 					   rxm_sar_match_msg_id,
 					   &rx_buf->pkt.ctrl_hdr.msg_id);
 	if (!sar_entry)
 		return rxm_handle_recv_comp(rx_buf);
-	rx_buf->recv_entry =
-		container_of(sar_entry, struct rxm_recv_entry, sar.entry);
-	return rx_buf->ep->txrx_ops->handle_seg_data_rx(rx_buf);
+
+	rx_buf->recv_entry = container_of(sar_entry, struct rxm_recv_entry,
+					  sar.entry);
+	return rxm_handle_seg_data(rx_buf);
 }
 
 static ssize_t rxm_rndv_send_ack_inject(struct rxm_rx_buf *rx_buf)
@@ -712,6 +719,7 @@ static ssize_t rxm_rndv_send_ack_inject(struct rxm_rx_buf *rx_buf)
 
 static ssize_t rxm_rndv_send_ack(struct rxm_rx_buf *rx_buf)
 {
+	struct rxm_deferred_tx_entry *def_tx_entry;
 	ssize_t ret;
 
 	assert(rx_buf->conn);
@@ -721,39 +729,38 @@ static ssize_t rxm_rndv_send_ack(struct rxm_rx_buf *rx_buf)
 		if (!ret)
 			goto out;
 
-		if (OFI_UNLIKELY(ret != -FI_EAGAIN)) {
+		if (ret != -FI_EAGAIN) {
 			FI_WARN(&rxm_prov, FI_LOG_CQ,
 				"send ack via inject failed for MSG provider\n");
 			return ret;
 		}
 	}
 
-	rx_buf->recv_entry->rndv.tx_buf = (struct rxm_tx_base_buf *)
-		rxm_tx_buf_alloc(rx_buf->ep, RXM_BUF_POOL_TX_ACK);
-	if (OFI_UNLIKELY(!rx_buf->recv_entry->rndv.tx_buf)) {
+	rx_buf->recv_entry->rndv.tx_buf = rxm_tx_buf_alloc(rx_buf->ep,
+							   RXM_BUF_POOL_TX_ACK);
+	if (!rx_buf->recv_entry->rndv.tx_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ,
 			"ran out of buffers from ACK buffer pool\n");
 		return -FI_EAGAIN;
 	}
-	assert(rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_ack);
+	assert(rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.type ==
+	       rxm_ctrl_rndv_ack);
 
 	assert(rx_buf->hdr.state == RXM_RNDV_READ);
 
-	rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.conn_id =
-		rx_buf->conn->handle.remote_key;
-	rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.msg_id =
-		rx_buf->pkt.ctrl_hdr.msg_id;
+	rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->
+								handle.remote_key;
+	rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.
+							       ctrl_hdr.msg_id;
 
 	ret = fi_send(rx_buf->conn->msg_ep, &rx_buf->recv_entry->rndv.tx_buf->pkt,
 		      sizeof(rx_buf->recv_entry->rndv.tx_buf->pkt),
 		      rx_buf->recv_entry->rndv.tx_buf->hdr.desc, 0, rx_buf);
-	if (OFI_UNLIKELY(ret)) {
-		if (OFI_LIKELY(ret == -FI_EAGAIN)) {
-			struct rxm_deferred_tx_entry *def_tx_entry =
-				rxm_ep_alloc_deferred_tx_entry(
-					rx_buf->ep, rx_buf->conn,
-					RXM_DEFERRED_TX_RNDV_ACK);
-			if (OFI_UNLIKELY(!def_tx_entry)) {
+	if (ret) {
+		if (ret == -FI_EAGAIN) {
+			def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep,
+					rx_buf->conn, RXM_DEFERRED_TX_RNDV_ACK);
+			if (!def_tx_entry) {
 				FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to "
 					"allocate TX entry for deferred ACK\n");
 				ret = -FI_EAGAIN;
@@ -798,10 +805,11 @@ static int rxm_handle_remote_write(struct rxm_ep *rxm_ep,
 	return 0;
 }
 
-static inline void rxm_ep_format_atomic_resp_pkt_hdr(struct rxm_conn *rxm_conn,
-				struct rxm_tx_atomic_buf *tx_buf,
-				size_t data_len, uint32_t pkt_op,
-				enum fi_datatype datatype, uint8_t atomic_op)
+static void rxm_format_atomic_resp_pkt_hdr(struct rxm_conn *rxm_conn,
+					   struct rxm_tx_atomic_buf *tx_buf,
+					   size_t data_len, uint32_t pkt_op,
+					   enum fi_datatype datatype,
+					   uint8_t atomic_op)
 {
 	rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, pkt_op, 0, 0, 0,
 				 &tx_buf->pkt);
@@ -820,16 +828,16 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep,
 	struct rxm_deferred_tx_entry *def_tx_entry;
 	struct rxm_atomic_resp_hdr *atomic_hdr;
 	ssize_t ret;
-	ssize_t resp_len = result_len + sizeof(struct rxm_atomic_resp_hdr) +
-				sizeof(struct rxm_pkt);
+	ssize_t resp_len;
+
+	resp_len = result_len + sizeof(struct rxm_atomic_resp_hdr) +
+		   sizeof(struct rxm_pkt);
 
 	resp_buf->hdr.state = RXM_ATOMIC_RESP_SENT;
-	rxm_ep_format_atomic_resp_pkt_hdr(rx_buf->conn,
-					  resp_buf,
-					  resp_len,
-					  rx_buf->pkt.hdr.op,
-					  rx_buf->pkt.hdr.atomic.datatype,
-					  rx_buf->pkt.hdr.atomic.op);
+	rxm_format_atomic_resp_pkt_hdr(rx_buf->conn, resp_buf, resp_len,
+				       rx_buf->pkt.hdr.op,
+				       rx_buf->pkt.hdr.atomic.datatype,
+				       rx_buf->pkt.hdr.atomic.op);
 	resp_buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key;
 	resp_buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id;
 	atomic_hdr = (struct rxm_atomic_resp_hdr *) resp_buf->pkt.data;
@@ -839,21 +847,20 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep,
 	if (resp_len < rxm_ep->inject_limit) {
 		ret = fi_inject(rx_buf->conn->msg_ep, &resp_buf->pkt,
 				resp_len, 0);
-		if (OFI_LIKELY(!ret))
+		if (!ret)
 			ofi_buf_free(resp_buf);
 	} else {
 		ret = rxm_atomic_send_respmsg(rxm_ep, rx_buf->conn, resp_buf,
 					      resp_len);
 	}
-	if (OFI_UNLIKELY(ret)) {
+	if (ret) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ,
 			"Unable to send Atomic Response\n");
-		if (OFI_LIKELY(ret == -FI_EAGAIN)) {
-			def_tx_entry =
-				rxm_ep_alloc_deferred_tx_entry(rxm_ep,
+		if (ret == -FI_EAGAIN) {
+			def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep,
 						rx_buf->conn,
 						RXM_DEFERRED_TX_ATOMIC_RESP);
-			if (OFI_UNLIKELY(!def_tx_entry)) {
+			if (!def_tx_entry) {
 				FI_WARN(&rxm_prov, FI_LOG_CQ,
 					"Unable to allocate deferred Atomic "
 					"Response\n");
@@ -871,23 +878,22 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep,
 	return ret;
 }
 
-static inline void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src,
-				 void *cmp, void *res, size_t count,
-				 enum fi_datatype datatype, enum fi_op op)
+static void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src,
+			  void *cmp, void *res, size_t count,
+			  enum fi_datatype datatype, enum fi_op op)
 {
 	switch (pkt->hdr.op) {
 	case ofi_op_atomic:
-		ofi_atomic_write_handlers[op][datatype](dst, src, count);
+		assert(ofi_atomic_iswrite_op(op));
+		ofi_atomic_write_handler(op, datatype, dst, src, count);
 		break;
 	case ofi_op_atomic_fetch:
-		ofi_atomic_readwrite_handlers[op][datatype](dst, src, res,
-							    count);
+		assert(ofi_atomic_isreadwrite_op(op));
+		ofi_atomic_readwrite_handler(op, datatype, dst, src, res, count);
 		break;
 	case ofi_op_atomic_compare:
-		assert(op >= OFI_SWAP_OP_START &&
-		       op < OFI_SWAP_OP_START + OFI_SWAP_OP_LAST);
-		ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst,
-						src, cmp, res, count);
+		assert(ofi_atomic_isswap_op(op));
+		ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, count);
 		break;
 	default:
 		/* Validated prior to calling function */
@@ -895,8 +901,8 @@ static inline void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src,
 	}
 }
 
-static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
-					    struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
+				     struct rxm_rx_buf *rx_buf)
 {
 	struct rxm_atomic_hdr *req_hdr =
 			(struct rxm_atomic_hdr *) rx_buf->pkt.data;
@@ -922,12 +928,11 @@ static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 	if (rx_buf->ep->srx_ctx)
 		rx_buf->conn = rxm_key2conn(rx_buf->ep,
 					    rx_buf->pkt.ctrl_hdr.conn_id);
-	if (OFI_UNLIKELY(!rx_buf->conn))
+	if (!rx_buf->conn)
 		return -FI_EOTHER;
 
-	resp_buf = (struct rxm_tx_atomic_buf *)
-		   rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
-	if (OFI_UNLIKELY(!resp_buf)) {
+	resp_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
+	if (!resp_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"Unable to allocate from Atomic buffer pool\n");
 		/* TODO: Should this be -FI_ENOMEM - how does it get
@@ -945,8 +950,8 @@ static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 				"Atomic RMA MR verify error %d\n", ret);
-			ret = -FI_EACCES;
-			goto send_nak;
+			return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, 0,
+						    -FI_EACCES);
 		}
 	}
 
@@ -972,20 +977,18 @@ static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep,
 
 	return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf,
 				    result_len, FI_SUCCESS);
-send_nak:
-	return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, 0, ret);
 }
 
 
-static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
-					     struct rxm_rx_buf *rx_buf)
+static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
+				      struct rxm_rx_buf *rx_buf)
 {
 	struct rxm_tx_atomic_buf *tx_buf;
-	struct rxm_atomic_resp_hdr *resp_hdr =
-			(struct rxm_atomic_resp_hdr *) rx_buf->pkt.data;
+	struct rxm_atomic_resp_hdr *resp_hdr;
 	uint64_t len;
 	int ret = 0;
 
+	resp_hdr = (struct rxm_atomic_resp_hdr *) rx_buf->pkt.data;
 	tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_ATOMIC].pool,
 				      rx_buf->pkt.ctrl_hdr.msg_id);
 	FI_DBG(&rxm_prov, FI_LOG_CQ, "received atomic response: op: %" PRIu8
@@ -994,7 +997,7 @@ static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 
 	assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_REMOTE_CQ_DATA)));
 
-	if (OFI_UNLIKELY(resp_hdr->status)) {
+	if (resp_hdr->status) {
 		struct util_cntr *cntr = NULL;
 		FI_WARN(&rxm_prov, FI_LOG_CQ,
 		       "bad atomic response status %d\n", ntohl(resp_hdr->status));
@@ -1011,7 +1014,7 @@ static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 		}
 		rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr,
 				   tx_buf->app_context, ntohl(resp_hdr->status));
-		goto err;
+		goto free;
 	}
 
 	len = ofi_total_iov_len(tx_buf->result_iov, tx_buf->result_iov_count);
@@ -1020,7 +1023,7 @@ static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 			resp_hdr->data, len);
 
 	if (!(tx_buf->flags & FI_INJECT))
-		ret = rxm_cq_tx_comp_write(rxm_ep,
+		ret = rxm_cq_write_tx_comp(rxm_ep,
 					   ofi_tx_cq_flags(tx_buf->pkt.hdr.op),
 					   tx_buf->app_context, tx_buf->flags);
 
@@ -1035,17 +1038,28 @@ static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep,
 				   tx_buf->app_context, ntohl(resp_hdr->status));
 		assert(0);
 	}
-err:
+free:
 	rxm_rx_buf_free(rx_buf);
 	ofi_buf_free(tx_buf);
 	ofi_atomic_inc32(&rxm_ep->atomic_tx_credits);
 	assert(ofi_atomic_get32(&rxm_ep->atomic_tx_credits) <=
-				rxm_ep->rxm_info->tx_attr->size);
-
+	       rxm_ep->rxm_info->tx_attr->size);
 	return ret;
 }
 
-int rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_eager_buf)
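+/* A credit update arrived from the peer: hand the count carried in the
+ * control header to the core provider's flow control ops for this msg
+ * ep, then release the rx buffer.  Credit messages are internal and
+ * never generate an application completion. */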
+static ssize_t rxm_handle_credit(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf)
+{
+	struct rxm_domain *domain = container_of(rxm_ep->util_ep.domain,
+						 struct rxm_domain, util_domain);
+
+	domain->flow_ctrl_ops->add_credits(rx_buf->msg_ep,
+					   rx_buf->pkt.ctrl_hdr.ctrl_data);
+	rxm_rx_buf_free(rx_buf);
+	return FI_SUCCESS;
+}
+
+int rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep,
+			       struct rxm_tx_eager_buf *tx_eager_buf)
 {
 	int ret;
 
@@ -1060,15 +1074,16 @@ int rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *t
 	return ret;
 };
 
-ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
+ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
 {
-	ssize_t ret;
 	struct rxm_rx_buf *rx_buf;
+	struct rxm_tx_base_buf *tx_buf;
 	struct rxm_tx_sar_buf *tx_sar_buf;
 	struct rxm_tx_eager_buf *tx_eager_buf;
 	struct rxm_tx_rndv_buf *tx_rndv_buf;
 	struct rxm_tx_atomic_buf *tx_atomic_buf;
 	struct rxm_rma_buf *rma_buf;
+	ssize_t ret;
 
 	/* Remote write events may not consume a posted recv so op context
 	 * and hence state would be NULL */
@@ -1078,9 +1093,14 @@ ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
 	switch (RXM_GET_PROTO_STATE(comp->op_context)) {
 	case RXM_TX:
 		tx_eager_buf = comp->op_context;
-		ret = rxm_ep->txrx_ops->comp_eager_tx(rxm_ep, tx_eager_buf);
+		ret = rxm_ep->eager_ops->comp_tx(rxm_ep, tx_eager_buf);
 		ofi_buf_free(tx_eager_buf);
 		return ret;
+	case RXM_CREDIT_TX:
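+		/* Internal credit sends carry no app context; just
+		 * reclaim the buffer. */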
+		tx_buf = comp->op_context;
+		assert(comp->flags & FI_SEND);
+		ofi_buf_free(tx_buf);
+		return 0;
 	case RXM_INJECT_TX:
 		assert(0);
 		return -FI_EOPBADSTATE;
@@ -1107,6 +1127,8 @@ ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp)
 			return rxm_handle_atomic_req(rxm_ep, rx_buf);
 		case rxm_ctrl_atomic_resp:
 			return rxm_handle_atomic_resp(rxm_ep, rx_buf);
+		case rxm_ctrl_credit:
+			return rxm_handle_credit(rxm_ep, rx_buf);
 		default:
 			FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown message type\n");
 			assert(0);
@@ -1167,6 +1189,7 @@ void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr,
 
 	if (cntr)
 		rxm_cntr_incerr(cntr);
+
 	if (ofi_cq_write_error(cq, &err_entry)) {
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n");
 		assert(0);
@@ -1209,8 +1232,9 @@ void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err)
 		rxm_cntr_incerr(rxm_ep->util_ep.rd_cntr);
 }
 
-void rxm_cq_read_write_error(struct rxm_ep *rxm_ep)
+void rxm_handle_comp_error(struct rxm_ep *rxm_ep)
 {
+	struct rxm_tx_base_buf *base_buf;
 	struct rxm_tx_eager_buf *eager_buf;
 	struct rxm_tx_sar_buf *sar_buf;
 	struct rxm_tx_rndv_buf *rndv_buf;
@@ -1249,7 +1273,7 @@ void rxm_cq_read_write_error(struct rxm_ep *rxm_ep)
 	case RXM_RMA:
 		rma_buf = err_entry.op_context;
 		err_entry.op_context = rma_buf->app_context;
-		err_entry.flags = err_entry.flags;
+		/* err_entry.flags passes through unchanged from the msg ep */
 		if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local &&
 		    rxm_ep->msg_mr_local) {
 			rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count);
@@ -1262,6 +1286,11 @@ void rxm_cq_read_write_error(struct rxm_ep *rxm_ep)
 		err_entry.flags = ofi_tx_cq_flags(sar_buf->pkt.hdr.op);
 		rxm_finish_sar_segment_send(rxm_ep, sar_buf, true);
 		break;
+	case RXM_CREDIT_TX:
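+		/* A failed credit send has no app context to report. */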
+		base_buf = err_entry.op_context;
+		err_entry.op_context = NULL;
+		err_entry.flags = ofi_tx_cq_flags(base_buf->pkt.hdr.op);
+		break;
 	case RXM_RNDV_TX:
 		rndv_buf = err_entry.op_context;
 		err_entry.op_context = rndv_buf->app_context;
@@ -1308,24 +1337,23 @@ void rxm_cq_read_write_error(struct rxm_ep *rxm_ep)
 		FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n");
 }
 
-static inline int rxm_msg_ep_recv(struct rxm_rx_buf *rx_buf)
+static int rxm_msg_ep_recv(struct rxm_rx_buf *rx_buf)
 {
-	int ret;
+	int ret, level;
 
 	if (rx_buf->ep->srx_ctx)
 		rx_buf->conn = NULL;
 	rx_buf->hdr.state = RXM_RX;
 
-	ret = (int)fi_recv(rx_buf->msg_ep, &rx_buf->pkt,
-			   rxm_eager_limit + sizeof(struct rxm_pkt),
-			   rx_buf->hdr.desc, FI_ADDR_UNSPEC, rx_buf);
-	if (OFI_LIKELY(!ret))
+	ret = (int) fi_recv(rx_buf->msg_ep, &rx_buf->pkt,
+			    rxm_eager_limit + sizeof(struct rxm_pkt),
+			    rx_buf->hdr.desc, FI_ADDR_UNSPEC, rx_buf);
+	if (!ret)
 		return 0;
 
 	if (ret != -FI_EAGAIN) {
-		int level = FI_LOG_WARN;
-		if (rx_buf->conn->handle.state == RXM_CMAP_SHUTDOWN)
-			level = FI_LOG_DEBUG;
+		level = (rx_buf->conn->handle.state == RXM_CMAP_SHUTDOWN) ?
+			FI_LOG_DEBUG : FI_LOG_WARN;
 		FI_LOG(&rxm_prov, level, FI_LOG_EP_CTRL,
 		       "unable to post recv buf: %d\n", ret);
 	}
@@ -1340,11 +1368,11 @@ int rxm_msg_ep_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep)
 
 	for (i = 0; i < rxm_ep->msg_info->rx_attr->size; i++) {
 		rx_buf = rxm_rx_buf_alloc(rxm_ep, msg_ep, 1);
-		if (OFI_UNLIKELY(!rx_buf))
+		if (!rx_buf)
 			return -FI_ENOMEM;
 
 		ret = rxm_msg_ep_recv(rx_buf);
-		if (OFI_UNLIKELY(ret)) {
+		if (ret) {
 			ofi_buf_free(&rx_buf->hdr);
 			return ret;
 		}
@@ -1375,26 +1403,25 @@ void rxm_ep_do_progress(struct util_ep *util_ep)
 
 		ret = rxm_msg_ep_recv(buf);
 		if (ret) {
-			if (OFI_LIKELY(ret == -FI_EAGAIN))
+			if (ret == -FI_EAGAIN)
 				ofi_buf_free(&buf->hdr);
 		}
 	}
 
 	do {
-
 		ret = fi_cq_read(rxm_ep->msg_cq, &comp, 1);
 		if (ret > 0) {
-			// We don't have enough info to write a good
-			// error entry to the CQ at this point
-			ret = rxm_cq_handle_comp(rxm_ep, &comp);
-			if (OFI_UNLIKELY(ret)) {
+			ret = rxm_handle_comp(rxm_ep, &comp);
+			if (ret) {
+				// We don't have enough info to write a good
+				// error entry to the CQ at this point
 				rxm_cq_write_error_all(rxm_ep, ret);
 			} else {
 				ret = 1;
 			}
 		} else if (ret < 0 && (ret != -FI_EAGAIN)) {
 			if (ret == -FI_EAVAIL)
-				rxm_cq_read_write_error(rxm_ep);
+				rxm_handle_comp_error(rxm_ep);
 			else
 				rxm_cq_write_error_all(rxm_ep, ret);
 		}
@@ -1410,11 +1437,12 @@ void rxm_ep_do_progress(struct util_ep *util_ep)
 		}
 	} while ((ret > 0) && (++comp_read < rxm_ep->comp_per_progress));
 
-	if (OFI_UNLIKELY(!dlist_empty(&rxm_ep->deferred_tx_conn_queue))) {
+	if (!dlist_empty(&rxm_ep->deferred_tx_conn_queue)) {
 		dlist_foreach_container_safe(&rxm_ep->deferred_tx_conn_queue,
 					     struct rxm_conn, rxm_conn,
-					     deferred_conn_entry, conn_entry_tmp)
+					     deferred_conn_entry, conn_entry_tmp) {
 			rxm_ep_progress_deferred_queue(rxm_ep, rxm_conn);
+		}
 	}
 }
 
@@ -1479,7 +1507,7 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 		return -FI_ENOMEM;
 
 	ret = ofi_cq_init(&rxm_prov, domain, attr, util_cq, &ofi_cq_progress,
-			context);
+			  context);
 	if (ret)
 		goto err1;
 
diff --git a/deps/libfabric/prov/rxm/src/rxm_domain.c b/deps/libfabric/prov/rxm/src/rxm_domain.c
index 7b3d145cdf356800ff0e5518859e87a568810dd3..bd805a75eb0978d1c2cb9c4dea1dd8ebdac3c5ec 100644
--- a/deps/libfabric/prov/rxm/src/rxm_domain.c
+++ b/deps/libfabric/prov/rxm/src/rxm_domain.c
@@ -212,7 +212,7 @@ int rxm_msg_mr_regv(struct rxm_ep *rxm_ep, const struct iovec *iov,
 	}
 	return 0;
 err:
-	rxm_msg_mr_closev(mr, count);
+	rxm_msg_mr_closev(mr, i);
 	return ret;
 }
 
@@ -355,6 +355,82 @@ static struct fi_ops_mr rxm_domain_mr_ops = {
 	.regattr = rxm_mr_regattr,
 };
 
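+/* Registered with the msg domain as its credit send handler: the core
+ * provider calls this when it wants to grant credits to the peer on
+ * this connection.  The count travels in the ctrl_hdr of a small
+ * control packet sent with FI_PRIORITY; if the connection is not yet
+ * established, or the send cannot complete inline, the packet is
+ * queued on the connection's deferred TX queue instead. */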
+static ssize_t rxm_send_credits(struct fid_ep *ep, size_t credits)
+{
+	struct rxm_conn *rxm_conn =
+		container_of(ep->fid.context, struct rxm_conn, handle);
+	struct rxm_ep *rxm_ep = rxm_conn->handle.cmap->ep;
+	struct rxm_deferred_tx_entry *def_tx_entry;
+	struct rxm_tx_base_buf *tx_buf;
+	struct iovec iov;
+	struct fi_msg msg;
+	ssize_t ret;
+
+	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_CREDIT);
+	if (!tx_buf) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
+			"Ran out of buffers from TX credit buffer pool.\n");
+		return -FI_ENOMEM;
+	}
+
+	rxm_ep_format_tx_buf_pkt(rxm_conn, 0, rxm_ctrl_credit, 0, 0, FI_SEND,
+				 &tx_buf->pkt);
+	tx_buf->pkt.ctrl_hdr.type = rxm_ctrl_credit;
+	tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
+	tx_buf->pkt.ctrl_hdr.ctrl_data = credits;
+
+	if (rxm_conn->handle.state != RXM_CMAP_CONNECTED)
+		goto defer;
+
+	iov.iov_base = &tx_buf->pkt;
+	iov.iov_len = sizeof(struct rxm_pkt);
+	msg.msg_iov = &iov;
+	msg.iov_count = 1;
+	msg.context = tx_buf;
+	msg.desc = &tx_buf->hdr.desc;
+	msg.addr = 0;
+	msg.data = 0;
+
+	ret = fi_sendmsg(ep, &msg, FI_PRIORITY);
+	if (!ret)
+		return FI_SUCCESS;
+
+defer:
+	def_tx_entry = rxm_ep_alloc_deferred_tx_entry(
+		rxm_ep, rxm_conn, RXM_DEFERRED_TX_CREDIT_SEND);
+	if (!def_tx_entry) {
+		FI_WARN(&rxm_prov, FI_LOG_CQ,
+			"unable to allocate TX entry for deferred CREDIT mxg\n");
+		ofi_buf_free(tx_buf);
+		return -FI_ENOMEM;
+	}
+
+	def_tx_entry->credit_msg.tx_buf = tx_buf;
+	rxm_ep_enqueue_deferred_tx_queue(def_tx_entry);
+	return FI_SUCCESS;
+}
+
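+/* Fallback ops installed when the msg provider does not export
+ * OFI_OPS_FLOW_CTRL: credit operations become no-ops and enabling
+ * flow control reports -FI_ENOSYS. */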
+static void rxm_no_set_threshold(struct fid_ep *ep_fid, size_t threshold)
+{ }
+
+static void rxm_no_add_credits(struct fid_ep *ep_fid, size_t credits)
+{ }
+
+static void rxm_no_credit_handler(struct fid_domain *domain_fid,
+		ssize_t (*credit_handler)(struct fid_ep *ep, size_t credits))
+{ }
+
+static int rxm_no_enable_flow_ctrl(struct fid_ep *ep_fid)
+{
+	return -FI_ENOSYS;
+}
+
+struct ofi_ops_flow_ctrl rxm_no_ops_flow_ctrl = {
+	.size = sizeof(struct ofi_ops_flow_ctrl),
+	.set_threshold = rxm_no_set_threshold,
+	.add_credits = rxm_no_add_credits,
+	.enable = rxm_no_enable_flow_ctrl,
+	.set_send_handler = rxm_no_credit_handler,
+};
+
 int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		struct fid_domain **domain, void *context)
 {
@@ -362,6 +438,7 @@ int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	struct rxm_domain *rxm_domain;
 	struct rxm_fabric *rxm_fabric;
 	struct fi_info *msg_info;
+	struct ofi_ops_flow_ctrl *flow_ctrl_ops;
 
 	rxm_domain = calloc(1, sizeof(*rxm_domain));
 	if (!rxm_domain)
@@ -370,7 +447,7 @@ int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 	rxm_fabric = container_of(fabric, struct rxm_fabric, util_fabric.fabric_fid);
 
 	ret = ofi_get_core_info(fabric->api_version, NULL, NULL, 0, &rxm_util_prov,
-				info, rxm_info_to_core, &msg_info);
+				info, NULL, rxm_info_to_core, &msg_info);
 	if (ret)
 		goto err1;
 
@@ -399,6 +476,18 @@ int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 
 	rxm_domain->mr_local = ofi_mr_local(msg_info) && !ofi_mr_local(info);
 
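+	/* Probe the msg domain for flow control support and register
+	 * rxm_send_credits as the handler used to push credit updates
+	 * to peers.  -FI_ENOSYS simply selects the no-op fallback; any
+	 * other failure aborts domain open. */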
+	ret = fi_open_ops(&rxm_domain->msg_domain->fid, OFI_OPS_FLOW_CTRL, 0,
+			  (void **) &flow_ctrl_ops, NULL);
+	if (!ret && flow_ctrl_ops) {
+		rxm_domain->flow_ctrl_ops = flow_ctrl_ops;
+		rxm_domain->flow_ctrl_ops->set_send_handler(
+			rxm_domain->msg_domain, rxm_send_credits);
+	} else if (ret == -FI_ENOSYS) {
+		rxm_domain->flow_ctrl_ops = &rxm_no_ops_flow_ctrl;
+	} else {
+		goto err3;
+	}
+
 	fi_freeinfo(msg_info);
 	return 0;
 err3:
diff --git a/deps/libfabric/prov/rxm/src/rxm_ep.c b/deps/libfabric/prov/rxm/src/rxm_ep.c
index 5c927343be37735f85887e471e2a1c649005c305..de3e858696de59eaf458fd1cf5f2443f1dbe0211 100644
--- a/deps/libfabric/prov/rxm/src/rxm_ep.c
+++ b/deps/libfabric/prov/rxm/src/rxm_ep.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved.
  * Copyright (c) 2020 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -59,7 +59,7 @@ static int rxm_match_recv_entry(struct dlist_entry *item, const void *arg)
 
 static int rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg)
 {
-	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg;
+	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg;
 	struct rxm_recv_entry *recv_entry =
 		container_of(item, struct rxm_recv_entry, entry);
 	return ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag);
@@ -67,7 +67,7 @@ static int rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg)
 
 static int rxm_match_recv_entry_tag_addr(struct dlist_entry *item, const void *arg)
 {
-	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg;
+	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg;
 	struct rxm_recv_entry *recv_entry =
 		container_of(item, struct rxm_recv_entry, entry);
 	return ofi_match_addr(recv_entry->addr, attr->addr) &&
@@ -91,7 +91,7 @@ static int rxm_match_unexp_msg(struct dlist_entry *item, const void *arg)
 
 static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg)
 {
-	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg;
+	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg;
 	struct rxm_unexp_msg *unexp_msg =
 		container_of(item, struct rxm_unexp_msg, entry);
 	return ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag);
@@ -99,7 +99,7 @@ static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg)
 
 static int rxm_match_unexp_msg_tag_addr(struct dlist_entry *item, const void *arg)
 {
-	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg;
+	struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg;
 	struct rxm_unexp_msg *unexp_msg =
 		container_of(item, struct rxm_unexp_msg, entry);
 	return ofi_match_addr(attr->addr, unexp_msg->addr) &&
@@ -141,7 +141,8 @@ static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf)
 	void *mr_desc;
 	uint8_t type;
 
-	if ((pool->type != RXM_BUF_POOL_TX_INJECT) && pool->rxm_ep->msg_mr_local) {
+	if ((pool->type != RXM_BUF_POOL_TX_INJECT) &&
+	    pool->rxm_ep->msg_mr_local) {
 		mr_desc = fi_mr_desc((struct fid_mr *) region->context);
 	} else {
 		mr_desc = NULL;
@@ -179,6 +180,14 @@ static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf)
 		pkt = &tx_sar_buf->pkt;
 		type = rxm_ctrl_seg;
 		break;
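+	/* Credit packets use the base TX buffer: no payload follows the
+	 * rxm_pkt header. */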
+	case RXM_BUF_POOL_TX_CREDIT:
+		tx_base_buf = buf;
+		tx_base_buf->hdr.state = RXM_CREDIT_TX;
+
+		tx_base_buf->hdr.desc = mr_desc;
+		pkt = &tx_base_buf->pkt;
+		type = rxm_ctrl_credit;
+		break;
 	case RXM_BUF_POOL_TX_RNDV:
 		tx_rndv_buf = buf;
 
@@ -223,7 +232,7 @@ static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf)
 	}
 }
 
-static inline void rxm_buf_close(struct ofi_bufpool_region *region)
+static void rxm_buf_close(struct ofi_bufpool_region *region)
 {
 	struct rxm_buf_pool *pool = region->pool->attr.context;
 	struct rxm_ep *rxm_ep = pool->rxm_ep;
@@ -265,7 +274,8 @@ static int rxm_buf_pool_create(struct rxm_ep *rxm_ep, size_t size,
 	pool->type = type;
 	ret = ofi_bufpool_create_attr(&attr, &pool->pool);
 	if (ret)
-		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to create buf pool\n");
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"Unable to create buf pool\n");
 
 	return ret;
 }
@@ -295,7 +305,8 @@ static int rxm_recv_queue_init(struct rxm_ep *rxm_ep,  struct rxm_recv_queue *re
 {
 	recv_queue->rxm_ep = rxm_ep;
 	recv_queue->type = type;
-	recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init, recv_queue);
+	recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init,
+					    recv_queue);
 	if (!recv_queue->fs)
 		return -FI_ENOMEM;
 
@@ -342,6 +353,7 @@ static int rxm_ep_txrx_pool_create(struct rxm_ep *rxm_ep)
 		[RXM_BUF_POOL_TX_RNDV] = rxm_ep->msg_info->tx_attr->size,
 		[RXM_BUF_POOL_TX_ATOMIC] = rxm_ep->msg_info->tx_attr->size,
 		[RXM_BUF_POOL_TX_SAR] = rxm_ep->msg_info->tx_attr->size,
+		[RXM_BUF_POOL_TX_CREDIT] = rxm_ep->msg_info->tx_attr->size,
 		[RXM_BUF_POOL_RMA] = rxm_ep->msg_info->tx_attr->size,
 	};
 	size_t entry_sizes[] = {
@@ -359,13 +371,15 @@ static int rxm_ep_txrx_pool_create(struct rxm_ep *rxm_ep)
 					 sizeof(struct rxm_tx_atomic_buf),
 		[RXM_BUF_POOL_TX_SAR] = rxm_eager_limit +
 					sizeof(struct rxm_tx_sar_buf),
+		[RXM_BUF_POOL_TX_CREDIT] = sizeof(struct rxm_tx_base_buf),
 		[RXM_BUF_POOL_RMA] = rxm_eager_limit +
 				     sizeof(struct rxm_rma_buf),
 	};
 
 	dlist_init(&rxm_ep->repost_ready_list);
 
-	rxm_ep->buf_pools = calloc(1, RXM_BUF_POOL_MAX * sizeof(*rxm_ep->buf_pools));
+	rxm_ep->buf_pools = calloc(1, RXM_BUF_POOL_MAX *
+				      sizeof(*rxm_ep->buf_pools));
 	if (!rxm_ep->buf_pools)
 		return -FI_ENOMEM;
 
@@ -385,6 +399,7 @@ static int rxm_ep_txrx_pool_create(struct rxm_ep *rxm_ep)
 	}
 
 	return FI_SUCCESS;
+
 err:
 	while (--i >= RXM_BUF_POOL_START)
 		rxm_buf_pool_destroy(&rxm_ep->buf_pools[i]);
@@ -418,6 +433,7 @@ static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep)
 		goto err_recv_tag;
 
 	return FI_SUCCESS;
+
 err_recv_tag:
 	rxm_recv_queue_close(&rxm_ep->recv_queue);
 	return ret;
@@ -457,13 +473,14 @@ static int rxm_getname(fid_t fid, void *addr, size_t *addrlen)
 static int rxm_join_coll(struct fid_ep *ep, const void *addr, uint64_t flags,
 		    struct fid_mc **mc, void *context)
 {
-	if((flags & FI_COLLECTIVE) == 0) {
+	struct fi_collective_addr *c_addr;
+
+	if (!(flags & FI_COLLECTIVE))
 		return -FI_ENOSYS;
-	}
 
-	struct fi_collective_addr *c_addr = (struct fi_collective_addr *) addr;
+	c_addr = (struct fi_collective_addr *) addr;
 	return ofi_join_collective(ep, c_addr->coll_addr, c_addr->set, flags,
-				mc, context);
+				   mc, context);
 }
 
 static struct fi_ops_cm rxm_ops_cm = {
@@ -479,18 +496,14 @@ static struct fi_ops_cm rxm_ops_cm = {
 	.join = rxm_join_coll,
 };
 
-static struct rxm_handle_txrx_ops rxm_rx_ops = {
-	.comp_eager_tx = rxm_finish_eager_send,
-	.handle_eager_rx = rxm_cq_handle_eager,
-	.handle_rndv_rx = rxm_cq_handle_rndv,
-	.handle_seg_data_rx = rxm_cq_handle_seg_data,
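+/* Only the eager completion and rx paths differ between the default
+ * and collective endpoints; rndv and SAR handling are now called
+ * directly instead of through a per-ep ops table. */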
+static struct rxm_eager_ops def_eager_ops = {
+	.comp_tx = rxm_finish_eager_send,
+	.handle_rx = rxm_handle_eager,
 };
 
-static struct rxm_handle_txrx_ops rxm_coll_rx_ops = {
-	.comp_eager_tx = rxm_finish_coll_eager_send,
-	.handle_eager_rx = rxm_cq_handle_coll_eager,
-	.handle_rndv_rx = rxm_cq_handle_rndv,
-	.handle_seg_data_rx = rxm_cq_handle_seg_data,
+static struct rxm_eager_ops coll_eager_ops = {
+	.comp_tx = rxm_finish_coll_eager_send,
+	.handle_rx = rxm_handle_coll_eager,
 };
 
 static int rxm_ep_cancel_recv(struct rxm_ep *rxm_ep,
@@ -651,9 +664,9 @@ rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr,
 	if (dlist_empty(&recv_queue->unexp_msg_list))
 		return NULL;
 
-	match_attr.addr 	= addr;
-	match_attr.tag 		= tag;
-	match_attr.ignore 	= ignore;
+	match_attr.addr = addr;
+	match_attr.tag = tag;
+	match_attr.ignore = ignore;
 
 	entry = dlist_find_first_match(&recv_queue->unexp_msg_list,
 				       recv_queue->match_unexp, &match_attr);
@@ -670,11 +683,12 @@ static int rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue,
 				struct rxm_recv_entry *recv_entry,
 				struct rxm_rx_buf *rx_buf)
 {
+	struct rxm_recv_match_attr match_attr;
 	struct dlist_entry *entry;
 	bool last;
-	ssize_t ret = rxm_cq_handle_rx_buf(rx_buf);
-	struct rxm_recv_match_attr match_attr;
+	ssize_t ret;
 
+	ret = rxm_handle_rx_buf(rx_buf);
 	last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST;
 	if (ret || last)
 		return ret;
@@ -704,7 +718,7 @@ static int rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue,
 		dlist_remove(&rx_buf->unexp_msg.entry);
 		last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) ==
 		       RXM_SAR_SEG_LAST;
-		ret = rxm_cq_handle_rx_buf(rx_buf);
+		ret = rxm_handle_rx_buf(rx_buf);
 		if (ret || last)
 			break;
 	}
@@ -715,13 +729,14 @@ static int rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue,
 static int rxm_ep_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf,
 			       void *context)
 {
+	int ret;
+
 	RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message",
 			 rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag);
 
-	dlist_insert_tail(&rx_buf->repost_entry,
-			  &rx_buf->ep->repost_ready_list);
-	return ofi_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
+	ret = ofi_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV,
 			    0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag);
+	rxm_rx_buf_free(rx_buf);
+	return ret;
 }
 
 static int rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag,
@@ -835,7 +850,7 @@ rxm_ep_post_mrecv(struct rxm_ep *ep, const struct iovec *iov,
 		cur_iov.iov_len -= recv_entry->total_len;
 
 		if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg)
-			ret = rxm_cq_handle_rx_buf(rx_buf);
+			ret = rxm_handle_rx_buf(rx_buf);
 		else
 			ret = rxm_handle_unexp_sar(&ep->recv_queue, recv_entry,
 						   rx_buf);
@@ -876,12 +891,11 @@ rxm_ep_post_recv(struct rxm_ep *rxm_ep, const struct iovec *iov,
 		return FI_SUCCESS;
 	}
 
-	/* TODO: handle multi-recv */
 	dlist_remove(&rx_buf->unexp_msg.entry);
 	rx_buf->recv_entry = recv_entry;
 
 	if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg)
-		return rxm_cq_handle_rx_buf(rx_buf);
+		return rxm_handle_rx_buf(rx_buf);
 	else
 		return rxm_handle_unexp_sar(&rxm_ep->recv_queue, recv_entry,
 					    rx_buf);
@@ -931,12 +945,11 @@ rxm_ep_buf_recv(struct rxm_ep *rxm_ep, const struct iovec *iov,
 		recv_entry->comp_flags |= FI_CLAIM;
 
 		rx_buf->recv_entry = recv_entry;
-		ret = rxm_cq_handle_rx_buf(rx_buf);
+		ret = rxm_handle_rx_buf(rx_buf);
 	} else {
 		assert(flags & FI_DISCARD);
 		FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n");
-		dlist_insert_tail(&rx_buf->repost_entry,
-				  &rx_buf->ep->repost_ready_list);
+		rxm_rx_buf_free(rx_buf);
 	}
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
@@ -960,8 +973,8 @@ rxm_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags)
 
 }
 
-static ssize_t rxm_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
-			    fi_addr_t src_addr, void *context)
+static ssize_t rxm_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len,
+			   void *desc, fi_addr_t src_addr, void *context)
 {
 	struct rxm_ep *rxm_ep =
 		container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
@@ -975,7 +988,8 @@ static ssize_t rxm_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *d
 }
 
 static ssize_t rxm_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov,
-		void **desc, size_t count, fi_addr_t src_addr, void *context)
+			    void **desc, size_t count, fi_addr_t src_addr,
+			    void *context)
 {
 	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
 					     util_ep.ep_fid.fid);
@@ -1000,7 +1014,7 @@ static void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf,
 	rndv_hdr->count = (uint8_t)count;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_msg_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		       struct rxm_pkt *tx_pkt, size_t pkt_size,
 		       ofi_cntr_inc_func cntr_inc_func)
@@ -1017,7 +1031,7 @@ rxm_ep_msg_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	return ret;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_msg_normal_send(struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt,
 		       size_t pkt_size, void *desc, void *context)
 {
@@ -1029,24 +1043,26 @@ rxm_ep_msg_normal_send(struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt,
 	return fi_send(rxm_conn->msg_ep, tx_pkt, pkt_size, desc, 0, context);
 }
 
-static inline ssize_t
-rxm_ep_alloc_rndv_tx_res(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, void *context,
-			uint8_t count, const struct iovec *iov, void **desc, size_t data_len,
-			uint64_t data, uint64_t flags, uint64_t tag, uint8_t op,
-			struct rxm_tx_rndv_buf **tx_rndv_buf)
+static ssize_t
+rxm_ep_alloc_rndv_tx_res(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+			 void *context, uint8_t count, const struct iovec *iov,
+			 void **desc, size_t data_len, uint64_t data,
+			 uint64_t flags, uint64_t tag, uint8_t op,
+			 struct rxm_tx_rndv_buf **tx_rndv_buf)
 {
 	struct fid_mr **mr_iov;
 	ssize_t ret;
-	struct rxm_tx_rndv_buf *tx_buf = (struct rxm_tx_rndv_buf *)
-			rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_RNDV);
+	struct rxm_tx_rndv_buf *tx_buf;
 
-	if (OFI_UNLIKELY(!tx_buf)) {
+	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_RNDV);
+	if (!tx_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"Ran out of buffers from RNDV buffer pool\n");
 		return -FI_EAGAIN;
 	}
 
-	rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, flags, &(tx_buf)->pkt);
+	rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag,
+				 flags, &tx_buf->pkt);
 	tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
 	tx_buf->app_context = context;
 	tx_buf->flags = flags;
@@ -1063,7 +1079,8 @@ rxm_ep_alloc_rndv_tx_res(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, void
 		mr_iov = (struct fid_mr **)desc;
 	}
 
-	rxm_rndv_hdr_init(rxm_ep, &tx_buf->pkt.data, iov, tx_buf->count, mr_iov);
+	rxm_rndv_hdr_init(rxm_ep, &tx_buf->pkt.data, iov, tx_buf->count,
+			  mr_iov);
 
 	ret = sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr);
 
@@ -1081,7 +1098,7 @@ err:
 	return ret;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_rndv_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		   struct rxm_tx_rndv_buf *tx_buf, size_t pkt_size)
 {
@@ -1098,7 +1115,7 @@ rxm_ep_rndv_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, pkt_size,
 					     tx_buf->hdr.desc, tx_buf);
 	}
-	if (OFI_UNLIKELY(ret))
+	if (ret)
 		goto err;
 	return FI_SUCCESS;
 err:
@@ -1110,29 +1127,30 @@ err:
 	return ret;
 }
 
-static inline size_t
+static size_t
 rxm_ep_sar_calc_segs_cnt(struct rxm_ep *rxm_ep, size_t data_len)
 {
-	return (data_len + rxm_eager_limit - 1) /
-	       rxm_eager_limit;
+	return (data_len + rxm_eager_limit - 1) / rxm_eager_limit;
 }
 
-static inline struct rxm_tx_sar_buf *
+static struct rxm_tx_sar_buf *
 rxm_ep_sar_tx_prepare_segment(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-			      void *app_context, size_t total_len, size_t seg_len,
-			      size_t seg_no, uint64_t data, uint64_t flags, uint64_t tag,
-			      uint8_t op, enum rxm_sar_seg_type seg_type, uint64_t *msg_id)
+			      void *app_context, size_t total_len,
+			      size_t seg_len, size_t seg_no, uint64_t data,
+			      uint64_t flags, uint64_t tag, uint8_t op,
+			      enum rxm_sar_seg_type seg_type, uint64_t *msg_id)
 {
-	struct rxm_tx_sar_buf *tx_buf = (struct rxm_tx_sar_buf *)
-		rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_SAR);
+	struct rxm_tx_sar_buf *tx_buf;
 
-	if (OFI_UNLIKELY(!tx_buf)) {
+	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_SAR);
+	if (!tx_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"Ran out of buffers from SAR buffer pool\n");
 		return NULL;
 	};
 
-	rxm_ep_format_tx_buf_pkt(rxm_conn, total_len, op, data, tag, flags, &tx_buf->pkt);
+	rxm_ep_format_tx_buf_pkt(rxm_conn, total_len, op, data, tag, flags,
+				 &tx_buf->pkt);
 	if (seg_type == RXM_SAR_SEG_FIRST) {
 		*msg_id = tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
 	} else {
@@ -1154,19 +1172,20 @@ rxm_ep_sar_tx_cleanup(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	struct rxm_tx_sar_buf *first_tx_buf;
 
 	first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep->
-				buf_pools[RXM_BUF_POOL_TX_SAR].pool,
-				tx_buf->pkt.ctrl_hdr.msg_id);
+					    buf_pools[RXM_BUF_POOL_TX_SAR].pool,
+					    tx_buf->pkt.ctrl_hdr.msg_id);
 	ofi_buf_free(first_tx_buf);
 	ofi_buf_free(tx_buf);
 }
 
-static inline ssize_t
-rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
-				       void *app_context, size_t data_len, size_t remain_len,
-				       uint64_t msg_id, size_t seg_len, size_t seg_no, size_t segs_cnt,
-				       uint64_t data, uint64_t flags, uint64_t tag, uint8_t op,
-				       const struct iovec *iov, uint8_t count, size_t *iov_offset,
-				       struct rxm_tx_sar_buf **out_tx_buf)
+static ssize_t
+rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep,
+		struct rxm_conn *rxm_conn, void *app_context, size_t data_len,
+		size_t remain_len, uint64_t msg_id, size_t seg_len,
+		size_t seg_no, size_t segs_cnt, uint64_t data, uint64_t flags,
+		uint64_t tag, uint8_t op, const struct iovec *iov,
+		uint8_t count, size_t *iov_offset,
+		struct rxm_tx_sar_buf **out_tx_buf)
 {
 	struct rxm_tx_sar_buf *tx_buf;
 	enum rxm_sar_seg_type seg_type = RXM_SAR_SEG_MIDDLE;
@@ -1177,9 +1196,10 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, struct rxm_conn *r
 		seg_len = remain_len;
 	}
 
-	tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, app_context, data_len, seg_len,
-					       seg_no, data, flags, tag, op, seg_type, &msg_id);
-	if (OFI_UNLIKELY(!tx_buf)) {
+	tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, app_context,
+					       data_len, seg_len, seg_no, data,
+					       flags, tag, op, seg_type, &msg_id);
+	if (!tx_buf) {
 		*out_tx_buf = NULL;
 		return -FI_EAGAIN;
 	}
@@ -1193,7 +1213,7 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, struct rxm_conn *r
 		       tx_buf->pkt.ctrl_hdr.seg_size, tx_buf->hdr.desc, 0, tx_buf);
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		   void *context, uint8_t count, const struct iovec *iov,
 		   size_t data_len, size_t segs_cnt, uint64_t data,
@@ -1202,25 +1222,27 @@ rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	struct rxm_tx_sar_buf *tx_buf, *first_tx_buf;
 	size_t i, iov_offset = 0, remain_len = data_len;
 	ssize_t ret;
-	struct rxm_deferred_tx_entry *def_tx_entry;
+	struct rxm_deferred_tx_entry *def_tx;
 	uint64_t msg_id = 0;
 
 	assert(segs_cnt >= 2);
 
-	first_tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, context, data_len,
-						     rxm_eager_limit, 0, data, flags,
-						     tag, op, RXM_SAR_SEG_FIRST, &msg_id);
-	if (OFI_UNLIKELY(!first_tx_buf))
+	first_tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, context,
+						     data_len, rxm_eager_limit,
+						     0, data, flags, tag, op,
+						     RXM_SAR_SEG_FIRST, &msg_id);
+	if (!first_tx_buf)
 		return -FI_EAGAIN;
 
 	ofi_copy_from_iov(first_tx_buf->pkt.data, rxm_eager_limit,
 			  iov, count, iov_offset);
 	iov_offset += rxm_eager_limit;
 
-	ret = fi_send(rxm_conn->msg_ep, &first_tx_buf->pkt, sizeof(struct rxm_pkt) +
-		      first_tx_buf->pkt.ctrl_hdr.seg_size, first_tx_buf->hdr.desc, 0, first_tx_buf);
-	if (OFI_UNLIKELY(ret)) {
-		if (OFI_LIKELY(ret == -FI_EAGAIN))
+	ret = fi_send(rxm_conn->msg_ep, &first_tx_buf->pkt,
+		      sizeof(struct rxm_pkt) + first_tx_buf->pkt.ctrl_hdr.seg_size,
+		      first_tx_buf->hdr.desc, 0, first_tx_buf);
+	if (ret) {
+		if (ret == -FI_EAGAIN)
 			rxm_ep_do_progress(&rxm_ep->util_ep);
 		ofi_buf_free(first_tx_buf);
 		return ret;
@@ -1229,47 +1251,52 @@ rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	remain_len -= rxm_eager_limit;
 
 	for (i = 1; i < segs_cnt; i++) {
-		ret = rxm_ep_sar_tx_prepare_and_send_segment(
-					rxm_ep, rxm_conn, context, data_len, remain_len,
-					msg_id, rxm_eager_limit, i, segs_cnt, data,
-					flags, tag, op, iov, count, &iov_offset, &tx_buf);
-		if (OFI_UNLIKELY(ret)) {
-			if (OFI_LIKELY(ret == -FI_EAGAIN)) {
-				def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep, rxm_conn,
-									      RXM_DEFERRED_TX_SAR_SEG);
-				if (OFI_UNLIKELY(!def_tx_entry)) {
-					if (tx_buf)
-						ofi_buf_free(tx_buf);
-					return -FI_ENOMEM;
-				}
-				memcpy(def_tx_entry->sar_seg.payload.iov, iov, sizeof(*iov) * count);
-				def_tx_entry->sar_seg.payload.count = count;
-				def_tx_entry->sar_seg.payload.cur_iov_offset = iov_offset;
-				def_tx_entry->sar_seg.payload.tag = tag;
-				def_tx_entry->sar_seg.payload.data = data;
-				def_tx_entry->sar_seg.cur_seg_tx_buf = tx_buf;
-				def_tx_entry->sar_seg.app_context = context;
-				def_tx_entry->sar_seg.flags = flags;
-				def_tx_entry->sar_seg.op = op;
-				def_tx_entry->sar_seg.next_seg_no = i;
-				def_tx_entry->sar_seg.segs_cnt = segs_cnt;
-				def_tx_entry->sar_seg.total_len = data_len;
-				def_tx_entry->sar_seg.remain_len = remain_len;
-				def_tx_entry->sar_seg.msg_id = msg_id;
-				rxm_ep_enqueue_deferred_tx_queue(def_tx_entry);
-				return 0;
-			}
-
-			ofi_buf_free(first_tx_buf);
-			return ret;
+		ret = rxm_ep_sar_tx_prepare_and_send_segment(rxm_ep, rxm_conn,
+				context, data_len, remain_len,
+				msg_id, rxm_eager_limit, i, segs_cnt,
+				data, flags, tag, op, iov, count,
+				&iov_offset, &tx_buf);
+		if (ret) {
+			if (ret == -FI_EAGAIN)
+				goto defer;
+			goto free;
 		}
 		remain_len -= rxm_eager_limit;
 	}
 
 	return 0;
+
+free:
+	ofi_buf_free(first_tx_buf);
+	return ret;
+defer:
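+	/* Stash the remaining segmentation state so the transfer can
+	 * resume from the deferred TX queue once the msg ep accepts
+	 * more sends. */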
+	def_tx = rxm_ep_alloc_deferred_tx_entry(rxm_ep,
+			rxm_conn, RXM_DEFERRED_TX_SAR_SEG);
+	if (!def_tx) {
+		if (tx_buf)
+			ofi_buf_free(tx_buf);
+		return -FI_ENOMEM;
+	}
+	memcpy(def_tx->sar_seg.payload.iov,
+		iov, sizeof(*iov) * count);
+	def_tx->sar_seg.payload.count = count;
+	def_tx->sar_seg.payload.cur_iov_offset = iov_offset;
+	def_tx->sar_seg.payload.tag = tag;
+	def_tx->sar_seg.payload.data = data;
+	def_tx->sar_seg.cur_seg_tx_buf = tx_buf;
+	def_tx->sar_seg.app_context = context;
+	def_tx->sar_seg.flags = flags;
+	def_tx->sar_seg.op = op;
+	def_tx->sar_seg.next_seg_no = i;
+	def_tx->sar_seg.segs_cnt = segs_cnt;
+	def_tx->sar_seg.total_len = data_len;
+	def_tx->sar_seg.remain_len = remain_len;
+	def_tx->sar_seg.msg_id = msg_id;
+	rxm_ep_enqueue_deferred_tx_queue(def_tx);
+	return 0;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		      const void *buf, size_t len, size_t pkt_size,
 		      uint64_t data, uint64_t flags, uint64_t tag,
@@ -1278,9 +1305,8 @@ rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	struct rxm_tx_eager_buf *tx_buf;
 	ssize_t ret;
 
-	tx_buf = (struct rxm_tx_eager_buf *)
-		  rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX);
-	if (OFI_UNLIKELY(!tx_buf)) {
+	tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX);
+	if (!tx_buf) {
 		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 			"Ran out of buffers from Eager buffer pool\n");
 		return -FI_EAGAIN;
@@ -1294,15 +1320,15 @@ rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 
 	ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, pkt_size,
 				     tx_buf->hdr.desc, tx_buf);
-	if (OFI_UNLIKELY(ret)) {
-		if (OFI_LIKELY(ret == -FI_EAGAIN))
+	if (ret) {
+		if (ret == -FI_EAGAIN)
 			rxm_ep_do_progress(&rxm_ep->util_ep);
 		ofi_buf_free(tx_buf);
 	}
 	return ret;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_inject_send_fast(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			const void *buf, size_t len, struct rxm_pkt *inject_pkt)
 {
@@ -1316,20 +1342,24 @@ rxm_ep_inject_send_fast(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		inject_pkt->hdr.size = len;
 		memcpy(inject_pkt->data, buf, len);
 		ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, inject_pkt,
-					      pkt_size, rxm_ep->util_ep.tx_cntr_inc);
+					     pkt_size,
+					     rxm_ep->util_ep.tx_cntr_inc);
 	} else {
-		ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len, pkt_size,
-					    inject_pkt->hdr.data, inject_pkt->hdr.flags,
-					    inject_pkt->hdr.tag, inject_pkt->hdr.op);
+		ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len,
+					    pkt_size, inject_pkt->hdr.data,
+					    inject_pkt->hdr.flags,
+					    inject_pkt->hdr.tag,
+					    inject_pkt->hdr.op);
 	}
 	return ret;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		   const void *buf, size_t len, uint64_t data,
 		   uint64_t flags, uint64_t tag, uint8_t op)
 {
+	struct rxm_tx_base_buf *tx_buf;
 	size_t pkt_size = sizeof(struct rxm_pkt) + len;
 	ssize_t ret;
 
@@ -1337,11 +1367,10 @@ rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 
 	if (pkt_size <= rxm_ep->inject_limit &&
 	    !rxm_ep->util_ep.tx_cntr) {
-		struct rxm_tx_base_buf *tx_buf = (struct rxm_tx_base_buf *)
-			rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_INJECT);
-		if (OFI_UNLIKELY(!tx_buf)) {
+		tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_INJECT);
+		if (!tx_buf) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
-				"Ran out of buffers from Eager Inject buffer pool\n");
+				"Ran out of eager inject buffers\n");
 			ret = -FI_EAGAIN;
 			goto unlock;
 		}
@@ -1350,7 +1379,8 @@ rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		memcpy(tx_buf->pkt.data, buf, len);
 
 		ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, &tx_buf->pkt,
-					     pkt_size, rxm_ep->util_ep.tx_cntr_inc);
+					     pkt_size,
+					     rxm_ep->util_ep.tx_cntr_inc);
 		ofi_buf_free(tx_buf);
 	} else {
 		ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len,
@@ -1367,6 +1397,7 @@ rxm_ep_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 		   void *context, uint64_t data, uint64_t flags, uint64_t tag,
 		   uint8_t op, struct rxm_pkt *inject_pkt)
 {
+	struct rxm_tx_eager_buf *tx_buf;
 	size_t data_len = ofi_total_iov_len(iov, count);
 	size_t total_len = sizeof(struct rxm_pkt) + data_len;
 	ssize_t ret;
@@ -1377,10 +1408,8 @@ rxm_ep_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	       (data_len <= rxm_ep->rxm_info->tx_attr->inject_size));
 
 	if (data_len <= rxm_eager_limit) {
-		struct rxm_tx_eager_buf *tx_buf = (struct rxm_tx_eager_buf *)
-			rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX);
-
-		if (OFI_UNLIKELY(!tx_buf)) {
+		tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX);
+		if (!tx_buf) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
 				"Ran out of buffers from Eager buffer pool\n");
 			ret = -FI_EAGAIN;
@@ -1396,7 +1425,7 @@ rxm_ep_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 
 		ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, total_len,
 					     tx_buf->hdr.desc, tx_buf);
-		if (OFI_UNLIKELY(ret)) {
+		if (ret) {
 			if (ret == -FI_EAGAIN)
 				rxm_ep_do_progress(&rxm_ep->util_ep);
 			ofi_buf_free(tx_buf);
@@ -1412,10 +1441,11 @@ rxm_ep_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	} else {
 		struct rxm_tx_rndv_buf *tx_buf;
 
-		ret = rxm_ep_alloc_rndv_tx_res(rxm_ep, rxm_conn, context, (uint8_t)count,
-					      iov, desc, data_len, data, flags, tag, op,
-					      &tx_buf);
-		if (OFI_LIKELY(ret >= 0))
+		ret = rxm_ep_alloc_rndv_tx_res(rxm_ep, rxm_conn, context,
+					       (uint8_t) count, iov, desc,
+					       data_len, data, flags, tag, op,
+					       &tx_buf);
+		if (ret >= 0)
 			ret = rxm_ep_rndv_tx_send(rxm_ep, rxm_conn, tx_buf, ret);
 	}
 unlock:
@@ -1426,9 +1456,10 @@ struct rxm_deferred_tx_entry *
 rxm_ep_alloc_deferred_tx_entry(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			       enum rxm_deferred_tx_entry_type type)
 {
-	struct rxm_deferred_tx_entry *def_tx_entry =
-			calloc(1, sizeof(*def_tx_entry));
-	if (OFI_UNLIKELY(!def_tx_entry))
+	struct rxm_deferred_tx_entry *def_tx_entry;
+
+	def_tx_entry = calloc(1, sizeof(*def_tx_entry));
+	if (!def_tx_entry)
 		return NULL;
 
 	def_tx_entry->rxm_ep = rxm_ep;
@@ -1439,8 +1470,9 @@ rxm_ep_alloc_deferred_tx_entry(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	return def_tx_entry;
 }
 
-static inline void
-rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry, ssize_t ret)
+static void
+rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry,
+				ssize_t ret)
 {
 	rxm_ep_sar_tx_cleanup(def_tx_entry->rxm_ep, def_tx_entry->rxm_conn,
 			      def_tx_entry->sar_seg.cur_seg_tx_buf);
@@ -1458,10 +1490,11 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry
 	struct rxm_tx_sar_buf *tx_buf = def_tx_entry->sar_seg.cur_seg_tx_buf;
 
 	if (tx_buf) {
-		ret = fi_send(def_tx_entry->rxm_conn->msg_ep, &tx_buf->pkt, sizeof(tx_buf->pkt) +
-			      tx_buf->pkt.ctrl_hdr.seg_size, tx_buf->hdr.desc, 0, tx_buf);
-		if (OFI_UNLIKELY(ret)) {
-			if (OFI_LIKELY(ret != -FI_EAGAIN)) {
+		ret = fi_send(def_tx_entry->rxm_conn->msg_ep, &tx_buf->pkt,
+			      sizeof(tx_buf->pkt) + tx_buf->pkt.ctrl_hdr.seg_size,
+			      tx_buf->hdr.desc, 0, tx_buf);
+		if (ret) {
+			if (ret != -FI_EAGAIN) {
 				rxm_ep_sar_handle_segment_failure(def_tx_entry, ret);
 				goto sar_finish;
 			}
@@ -1471,28 +1504,34 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry
 		def_tx_entry->sar_seg.next_seg_no++;
 		def_tx_entry->sar_seg.remain_len -= rxm_eager_limit;
 
-		if (def_tx_entry->sar_seg.next_seg_no == def_tx_entry->sar_seg.segs_cnt) {
+		if (def_tx_entry->sar_seg.next_seg_no ==
+		    def_tx_entry->sar_seg.segs_cnt) {
 			assert(rxm_sar_get_seg_type(&tx_buf->pkt.ctrl_hdr) ==
 			       RXM_SAR_SEG_LAST);
 			goto sar_finish;
 		}
 	}
 
-	while (def_tx_entry->sar_seg.next_seg_no != def_tx_entry->sar_seg.segs_cnt) {
+	while (def_tx_entry->sar_seg.next_seg_no !=
+	       def_tx_entry->sar_seg.segs_cnt) {
 		ret = rxm_ep_sar_tx_prepare_and_send_segment(
 				def_tx_entry->rxm_ep, def_tx_entry->rxm_conn,
 				def_tx_entry->sar_seg.app_context,
-				def_tx_entry->sar_seg.total_len, def_tx_entry->sar_seg.remain_len,
+				def_tx_entry->sar_seg.total_len,
+				def_tx_entry->sar_seg.remain_len,
 				def_tx_entry->sar_seg.msg_id, rxm_eager_limit,
-				def_tx_entry->sar_seg.next_seg_no, def_tx_entry->sar_seg.segs_cnt,
-				def_tx_entry->sar_seg.payload.data, def_tx_entry->sar_seg.flags,
-				def_tx_entry->sar_seg.payload.tag, def_tx_entry->sar_seg.op,
+				def_tx_entry->sar_seg.next_seg_no,
+				def_tx_entry->sar_seg.segs_cnt,
+				def_tx_entry->sar_seg.payload.data,
+				def_tx_entry->sar_seg.flags,
+				def_tx_entry->sar_seg.payload.tag,
+				def_tx_entry->sar_seg.op,
 				def_tx_entry->sar_seg.payload.iov,
 				def_tx_entry->sar_seg.payload.count,
 				&def_tx_entry->sar_seg.payload.cur_iov_offset,
 				&def_tx_entry->sar_seg.cur_seg_tx_buf);
-		if (OFI_UNLIKELY(ret)) {
-			if (OFI_LIKELY(ret != -FI_EAGAIN)) {
+		if (ret) {
+			if (ret != -FI_EAGAIN) {
 				rxm_ep_sar_handle_segment_failure(def_tx_entry, ret);
 				goto sar_finish;
 			}
@@ -1514,8 +1553,13 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 				    struct rxm_conn *rxm_conn)
 {
 	struct rxm_deferred_tx_entry *def_tx_entry;
+	struct iovec iov;
+	struct fi_msg msg;
 	ssize_t ret = 0;
 
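+	/* Nothing can be flushed until the connection handshake
+	 * completes; the entries stay queued for a later pass. */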
+	if (rxm_conn->handle.state != RXM_CMAP_CONNECTED)
+		return;
+
 	while (!dlist_empty(&rxm_conn->deferred_tx_queue) && !ret) {
 		def_tx_entry = container_of(rxm_conn->deferred_tx_queue.next,
 					    struct rxm_deferred_tx_entry, entry);
@@ -1529,8 +1573,8 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 				      def_tx_entry->rndv_ack.rx_buf->recv_entry->
 					rndv.tx_buf->hdr.desc,
 				      0, def_tx_entry->rndv_ack.rx_buf);
-			if (OFI_UNLIKELY(ret)) {
-				if (OFI_LIKELY(ret == -FI_EAGAIN))
+			if (ret) {
+				if (ret == -FI_EAGAIN)
 					break;
 				rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq,
 						   def_tx_entry->rxm_ep->util_ep.rx_cntr,
@@ -1551,8 +1595,8 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 				       def_tx_entry->rndv_read.rma_iov.addr,
 				       def_tx_entry->rndv_read.rma_iov.key,
 				       def_tx_entry->rndv_read.rx_buf);
-			if (OFI_UNLIKELY(ret)) {
-				if (OFI_LIKELY(ret == -FI_EAGAIN))
+			if (ret) {
+				if (ret == -FI_EAGAIN)
 					break;
 				rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq,
 						   def_tx_entry->rxm_ep->util_ep.rx_cntr,
@@ -1571,34 +1615,60 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep,
 					def_tx_entry->rxm_conn,
 					def_tx_entry->atomic_resp.tx_buf,
 					def_tx_entry->atomic_resp.len);
-			if (OFI_UNLIKELY(ret))
-				if (OFI_LIKELY(ret == -FI_EAGAIN))
+			if (ret)
+				if (ret == -FI_EAGAIN)
 					break;
 			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
 			free(def_tx_entry);
 			break;
+		case RXM_DEFERRED_TX_CREDIT_SEND:
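+			/* Retry a credit update that rxm_send_credits()
+			 * could not send inline. */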
+			iov.iov_base = &def_tx_entry->credit_msg.tx_buf->pkt;
+			iov.iov_len = sizeof(def_tx_entry->credit_msg.tx_buf->pkt);
+
+			msg.addr = 0;
+			msg.context = def_tx_entry->credit_msg.tx_buf;
+			msg.data = 0;
+			msg.desc = &def_tx_entry->credit_msg.tx_buf->hdr.desc;
+			msg.iov_count = 1;
+			msg.msg_iov = &iov;
+
+			ret = fi_sendmsg(def_tx_entry->rxm_conn->msg_ep, &msg,
+					 FI_PRIORITY);
+			if (ret) {
+				if (ret == -FI_EAGAIN)
+					break;
+				/* Credit msgs have no app context; the
+				 * rndv_read union member is not valid
+				 * here. */
+				rxm_cq_write_error(
+					def_tx_entry->rxm_ep->util_ep.rx_cq,
+					def_tx_entry->rxm_ep->util_ep.rx_cntr,
+					NULL, ret);
+				break;
+			}
+			rxm_ep_dequeue_deferred_tx_queue(def_tx_entry);
+			free(def_tx_entry);
+			break;
 		}
 	}
 }
 
-static ssize_t rxm_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
-			      uint64_t flags)
+static ssize_t
+rxm_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc,
-				  msg->iov_count, msg->context, msg->data,
-				  flags | rxm_ep->util_ep.tx_msg_flags, 0, ofi_op_msg,
-				  ((flags & FI_REMOTE_CQ_DATA) ?
-				   rxm_conn->inject_data_pkt : rxm_conn->inject_pkt));
+				 msg->iov_count, msg->context, msg->data,
+				 flags | rxm_ep->util_ep.tx_msg_flags, 0, ofi_op_msg,
+				 ((flags & FI_REMOTE_CQ_DATA) ?
+				 rxm_conn->inject_data_pkt : rxm_conn->inject_pkt));
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
 	return ret;
@@ -1607,18 +1677,18 @@ unlock:
 static ssize_t rxm_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len,
 			   void *desc, fi_addr_t dest_addr, void *context)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
+	struct rxm_ep *rxm_ep;
 	struct iovec iov = {
-		.iov_base = (void *)buf,
+		.iov_base = (void *) buf,
 		.iov_len = len,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context,
@@ -1633,14 +1703,14 @@ static ssize_t rxm_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov,
 			    void **desc, size_t count, fi_addr_t dest_addr,
 			    void *context)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, iov, desc, count, context,
@@ -1654,14 +1724,14 @@ unlock:
 static ssize_t rxm_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len,
 			     fi_addr_t dest_addr)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, 0,
@@ -1672,16 +1742,16 @@ unlock:
 	return ret;
 }
 
-static ssize_t rxm_ep_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
-				  fi_addr_t dest_addr)
+static ssize_t rxm_ep_inject_fast(struct fid_ep *ep_fid, const void *buf,
+				  size_t len, fi_addr_t dest_addr)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		return ret;
 
 	return rxm_ep_inject_send_fast(rxm_ep, rxm_conn, buf, len,
@@ -1692,18 +1762,18 @@ static ssize_t rxm_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t le
 			       void *desc, uint64_t data, fi_addr_t dest_addr,
 			       void *context)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
+	struct rxm_ep *rxm_ep;
 	struct iovec iov = {
-		.iov_base = (void *)buf,
+		.iov_base = (void *) buf,
 		.iov_len = len,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data,
@@ -1717,14 +1787,14 @@ unlock:
 static ssize_t rxm_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
 				 uint64_t data, fi_addr_t dest_addr)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, data,
@@ -1738,13 +1808,13 @@ unlock:
 static ssize_t rxm_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 				      uint64_t data, fi_addr_t dest_addr)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		return ret;
 
 	rxm_conn->inject_data_pkt->hdr.data = data;
@@ -1808,7 +1878,7 @@ rxm_ep_post_trecv(struct rxm_ep *rxm_ep, const struct iovec *iov,
 	rx_buf->recv_entry = recv_entry;
 
 	if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg)
-		return rxm_cq_handle_rx_buf(rx_buf);
+		return rxm_handle_rx_buf(rx_buf);
 	else
 		return rxm_handle_unexp_sar(&rxm_ep->trecv_queue, recv_entry,
 					    rx_buf);
@@ -1829,24 +1899,26 @@ rxm_ep_trecv_common(struct rxm_ep *rxm_ep, const struct iovec *iov,
 	return ret;
 }
 
-static ssize_t rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
-			       uint64_t flags)
+static ssize_t
+rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
+		uint64_t flags)
 {
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 	struct rxm_recv_entry *recv_entry;
 	struct fi_recv_context *recv_ctx;
 	struct rxm_rx_buf *rx_buf;
 	void *context = msg->context;
 	ssize_t ret = 0;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	flags |= rxm_ep->util_ep.rx_msg_flags;
 
 	if (!(flags & (FI_CLAIM | FI_PEEK)) &&
 	    !(rxm_ep->rxm_info->mode & FI_BUFFERED_RECV)) {
 		return rxm_ep_trecv_common(rxm_ep, msg->msg_iov, msg->desc,
 					   msg->iov_count, msg->addr,
-					   msg->tag, msg->ignore, context, flags);
+					   msg->tag, msg->ignore, context,
+					   flags);
 	}
 
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
@@ -1863,8 +1935,7 @@ static ssize_t rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged
 
 		assert(flags & FI_DISCARD);
 		FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n");
-		dlist_insert_tail(&rx_buf->repost_entry,
-				  &rx_buf->ep->repost_ready_list);
+		rxm_rx_buf_free(rx_buf);
 		goto unlock;
 	}
 
@@ -1898,7 +1969,7 @@ claim:
 		recv_entry->comp_flags |= FI_CLAIM;
 
 	rx_buf->recv_entry = recv_entry;
-	ret = rxm_cq_handle_rx_buf(rx_buf);
+	ret = rxm_handle_rx_buf(rx_buf);
 
 unlock:
 	ofi_ep_lock_release(&rxm_ep->util_ep);
@@ -1909,13 +1980,13 @@ static ssize_t rxm_ep_trecv(struct fid_ep *ep_fid, void *buf, size_t len,
 			    void *desc, fi_addr_t src_addr, uint64_t tag,
 			    uint64_t ignore, void *context)
 {
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 	struct iovec iov = {
 		.iov_base	= buf,
 		.iov_len	= len,
 	};
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	return rxm_ep_trecv_common(rxm_ep, &iov, &desc, 1, src_addr, tag, ignore,
 				  context, rxm_ep->util_ep.rx_op_flags);
 }
@@ -1924,24 +1995,25 @@ static ssize_t rxm_ep_trecvv(struct fid_ep *ep_fid, const struct iovec *iov,
 			     void **desc, size_t count, fi_addr_t src_addr,
 			     uint64_t tag, uint64_t ignore, void *context)
 {
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
-	return rxm_ep_trecv_common(rxm_ep, iov, desc, count, src_addr, tag, ignore,
-				  context, rxm_ep->util_ep.rx_op_flags);
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	return rxm_ep_trecv_common(rxm_ep, iov, desc, count, src_addr, tag,
+				   ignore, context, rxm_ep->util_ep.rx_op_flags);
 }
 
-static ssize_t rxm_ep_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
-			       uint64_t flags)
+static ssize_t
+rxm_ep_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
+		uint64_t flags)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc,
@@ -1958,18 +2030,18 @@ static ssize_t rxm_ep_tsend(struct fid_ep *ep_fid, const void *buf, size_t len,
 			    void *desc, fi_addr_t dest_addr, uint64_t tag,
 			    void *context)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
+	struct rxm_ep *rxm_ep;
 	struct iovec iov = {
-		.iov_base = (void *)buf,
+		.iov_base = (void *) buf,
 		.iov_len = len,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, 0,
@@ -1984,14 +2056,14 @@ static ssize_t rxm_ep_tsendv(struct fid_ep *ep_fid, const struct iovec *iov,
 			     void **desc, size_t count, fi_addr_t dest_addr,
 			     uint64_t tag, void *context)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, iov, desc, count, context, 0,
@@ -2005,14 +2077,14 @@ unlock:
 static ssize_t rxm_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len,
 			      fi_addr_t dest_addr, uint64_t tag)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, 0,
@@ -2026,13 +2098,13 @@ unlock:
 static ssize_t rxm_ep_tinject_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 				   fi_addr_t dest_addr, uint64_t tag)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		return ret;
 
 	rxm_conn->tinject_pkt->hdr.tag = tag;
@@ -2045,18 +2117,18 @@ static ssize_t rxm_ep_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t l
 				void *desc, uint64_t data, fi_addr_t dest_addr,
 				uint64_t tag, void *context)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
 	struct iovec iov = {
 		.iov_base = (void *)buf,
 		.iov_len = len,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data,
@@ -2070,14 +2142,14 @@ unlock:
 static ssize_t rxm_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len,
 				  uint64_t data, fi_addr_t dest_addr, uint64_t tag)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		goto unlock;
 
 	ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, data,
@@ -2091,13 +2163,13 @@ unlock:
 static ssize_t rxm_ep_tinjectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 				       uint64_t data, fi_addr_t dest_addr, uint64_t tag)
 {
-	int ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
-	if (OFI_UNLIKELY(ret))
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
+	if (ret)
 		return ret;
 
 	rxm_conn->tinject_data_pkt->hdr.tag = tag;
@@ -2203,9 +2275,9 @@ static int rxm_listener_close(struct rxm_ep *rxm_ep)
 static int rxm_ep_close(struct fid *fid)
 {
 	int ret, retv = 0;
-	struct rxm_ep *rxm_ep =
-		container_of(fid, struct rxm_ep, util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
+	rxm_ep = container_of(fid, struct rxm_ep, util_ep.ep_fid.fid);
 	if (rxm_ep->cmap)
 		rxm_cmap_free(rxm_ep->cmap);
 
@@ -2214,6 +2286,9 @@ static int rxm_ep_close(struct fid *fid)
 		retv = ret;
 
 	rxm_ep_txrx_res_close(rxm_ep);
+	ret = rxm_ep_msg_res_close(rxm_ep);
+	if (ret)
+		retv = ret;
 
 	if (rxm_ep->msg_cq) {
 		ret = fi_close(&rxm_ep->msg_cq->fid);
@@ -2223,10 +2298,6 @@ static int rxm_ep_close(struct fid *fid)
 		}
 	}
 
-	ret = rxm_ep_msg_res_close(rxm_ep);
-	if (ret)
-		retv = ret;
-
 	ofi_endpoint_close(&rxm_ep->util_ep);
 	fi_freeinfo(rxm_ep->rxm_info);
 	free(rxm_ep);
@@ -2289,6 +2360,19 @@ static int rxm_msg_cq_fd_needed(struct rxm_ep *rxm_ep)
 		(rxm_ep->util_ep.rem_rd_cntr && rxm_ep->util_ep.rem_rd_cntr->wait));
 }
 
+static enum fi_wait_obj rxm_get_wait_obj(struct rxm_ep *ep)
+{
+	if (!rxm_msg_cq_fd_needed(ep))
+		return FI_WAIT_NONE;
+
+	if ((def_tcp_wait_obj != FI_WAIT_UNSPEC) &&
+	    !strncasecmp(ep->msg_info->fabric_attr->prov_name, "tcp",
+			 strlen("tcp"))) {
+		return def_tcp_wait_obj;
+	}
+	return def_wait_obj;
+}
+
 static int rxm_ep_msg_cq_open(struct rxm_ep *rxm_ep)
 {
 	struct rxm_domain *rxm_domain;
@@ -2307,11 +2391,12 @@ static int rxm_ep_msg_cq_open(struct rxm_ep *rxm_ep)
 	};
 	int i, ret;
 
-	cq_attr.size = (rxm_ep->msg_info->tx_attr->size +
-			rxm_ep->msg_info->rx_attr->size) * rxm_def_univ_size;
+	cq_attr.size = rxm_ep->msg_info->rx_attr->size;
+	if (rxm_ep->msg_info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT)
+		cq_attr.size *= ofi_universe_size;
+	cq_attr.size += rxm_ep->msg_info->tx_attr->size * ofi_universe_size;
 	cq_attr.format = FI_CQ_FORMAT_DATA;
-	cq_attr.wait_obj = (rxm_msg_cq_fd_needed(rxm_ep) ?
-			    def_wait_obj : FI_WAIT_NONE);
+	cq_attr.wait_obj = rxm_get_wait_obj(rxm_ep);
 
 	rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain,
 				  util_domain);
@@ -2589,18 +2674,20 @@ err:
 
 static int rxm_ep_msg_res_open(struct rxm_ep *rxm_ep)
 {
+	struct rxm_domain *rxm_domain;
 	int ret;
-	struct rxm_domain *rxm_domain =
-		container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain);
 
+	rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain,
+				  util_domain);
  	ret = ofi_get_core_info(rxm_ep->util_ep.domain->fabric->fabric_fid.api_version,
 				NULL, NULL, 0, &rxm_util_prov, rxm_ep->rxm_info,
-				rxm_info_to_core, &rxm_ep->msg_info);
+				NULL, rxm_info_to_core, &rxm_ep->msg_info);
 	if (ret)
 		return ret;
 
  	if (rxm_ep->msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) {
-		ret = fi_srx_context(rxm_domain->msg_domain, rxm_ep->msg_info->rx_attr,
+		ret = fi_srx_context(rxm_domain->msg_domain,
+				     rxm_ep->msg_info->rx_attr,
 				     &rxm_ep->srx_ctx, NULL);
 		if (ret) {
 			FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
@@ -2616,10 +2703,10 @@ static int rxm_ep_msg_res_open(struct rxm_ep *rxm_ep)
  	/* Zero out the port as we would be creating multiple MSG EPs for a single
 	 * RXM EP and we don't want address conflicts. */
 	if (rxm_ep->msg_info->src_addr) {
-		if (((struct sockaddr *)rxm_ep->msg_info->src_addr)->sa_family == AF_INET)
-			((struct sockaddr_in *)(rxm_ep->msg_info->src_addr))->sin_port = 0;
+		if (((struct sockaddr *) rxm_ep->msg_info->src_addr)->sa_family == AF_INET)
+			((struct sockaddr_in *) (rxm_ep->msg_info->src_addr))->sin_port = 0;
 		else
-			((struct sockaddr_in6 *)(rxm_ep->msg_info->src_addr))->sin6_port = 0;
+			((struct sockaddr_in6 *) (rxm_ep->msg_info->src_addr))->sin6_port = 0;
 	}
 
 	return 0;
@@ -2679,10 +2766,10 @@ int rxm_endpoint(struct fid_domain *domain, struct fi_info *info,
 
 	if(rxm_ep->rxm_info->caps & FI_COLLECTIVE) {
 		(*ep_fid)->collective = &rxm_ops_collective;
-		rxm_ep->txrx_ops = &rxm_coll_rx_ops;
+		rxm_ep->eager_ops = &coll_eager_ops;
 	} else {
 		(*ep_fid)->collective = &rxm_ops_collective_none;
-		rxm_ep->txrx_ops = &rxm_rx_ops;
+		rxm_ep->eager_ops = &def_eager_ops;
 	}
 
 	if (rxm_ep->util_ep.domain->threading != FI_THREAD_SAFE) {
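
Editor's note: the deferred-queue rework above adds a RXM_DEFERRED_TX_CREDIT_SEND case that builds a one-element fi_msg and posts it with FI_PRIORITY, so credit updates bypass normal send ordering and are simply retried on the next progress pass when the core returns -FI_EAGAIN. A minimal sketch of that pattern, with msg_ep, pkt, pkt_len, and desc as hypothetical stand-ins for the rxm tx-buffer fields:

```c
#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: post a small control payload with FI_PRIORITY. A return of
 * -FI_EAGAIN means "leave the entry on the deferred queue and retry on
 * the next progress pass"; any other error completes as a failure. */
static ssize_t post_credit_msg(struct fid_ep *msg_ep, void *pkt,
			       size_t pkt_len, void *desc, void *context)
{
	struct iovec iov = {
		.iov_base = pkt,
		.iov_len = pkt_len,
	};
	struct fi_msg msg = {
		.msg_iov = &iov,
		.desc = &desc,
		.iov_count = 1,
		.addr = 0,	/* connected MSG EP: dest address unused */
		.context = context,
		.data = 0,
	};

	return fi_sendmsg(msg_ep, &msg, FI_PRIORITY);
}
```
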
diff --git a/deps/libfabric/prov/rxm/src/rxm_init.c b/deps/libfabric/prov/rxm/src/rxm_init.c
index f367044617ee2d7f65f649ac3d63d4da21063f74..b3e3139d335829995a5c2ede40434fd9bc821b79 100644
--- a/deps/libfabric/prov/rxm/src/rxm_init.c
+++ b/deps/libfabric/prov/rxm/src/rxm_init.c
@@ -37,11 +37,10 @@
 
 #include <ofi_prov.h>
 #include "rxm.h"
+#include "ofi_coll.h"
 
-#define RXM_ATOMIC_UNSUPPORTED_MSG_ORDER (FI_ORDER_RAW | FI_ORDER_ATOMIC_RAW | \
-					  FI_ORDER_RAR | FI_ORDER_ATOMIC_RAR | \
-					  FI_ORDER_WAW | FI_ORDER_ATOMIC_WAW | \
-					  FI_ORDER_WAR | FI_ORDER_ATOMIC_WAR | \
+#define RXM_ATOMIC_UNSUPPORTED_MSG_ORDER (FI_ORDER_RAW | FI_ORDER_RAR |  \
+					  FI_ORDER_WAW | FI_ORDER_WAR |  \
 					  FI_ORDER_SAR | FI_ORDER_SAW)
 
 #define RXM_PASSTHRU_CAPS (FI_MSG | FI_RMA | FI_SEND | FI_RECV |	\
@@ -50,10 +49,9 @@
 
 size_t rxm_msg_tx_size		= 128;
 size_t rxm_msg_rx_size		= 128;
-size_t rxm_def_univ_size	= 256;
 size_t rxm_eager_limit		= RXM_BUF_SIZE - sizeof(struct rxm_pkt);
 int force_auto_progress		= 0;
-enum fi_wait_obj def_wait_obj	= FI_WAIT_FD;
+enum fi_wait_obj def_wait_obj = FI_WAIT_FD, def_tcp_wait_obj = FI_WAIT_UNSPEC;
 
 char *rxm_proto_state_str[] = {
 	RXM_PROTO_STATES(OFI_STR)
@@ -90,9 +88,9 @@ void rxm_info_to_core_mr_modes(uint32_t version, const struct fi_info *hints,
 }
 
 int rxm_info_to_core(uint32_t version, const struct fi_info *hints,
-		     struct fi_info *core_info)
+		     const struct fi_info *base_info, struct fi_info *core_info)
 {
-	int use_srx = 0;
+	int ret, use_srx = 0;
 
 	rxm_info_to_core_mr_modes(version, hints, core_info);
 
@@ -124,8 +122,13 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints,
 			core_info->rx_attr->comp_order = hints->rx_attr->comp_order;
 		}
 	}
+
 	core_info->ep_attr->type = FI_EP_MSG;
-	if (!fi_param_get_bool(&rxm_prov, "use_srx", &use_srx) && use_srx) {
+
+	ret = fi_param_get_bool(&rxm_prov, "use_srx", &use_srx);
+	if (use_srx || ((ret == -FI_ENODATA) && base_info &&
+	    base_info->fabric_attr->prov_name &&
+	    !strcmp(base_info->fabric_attr->prov_name, "tcp"))) {
 		FI_DBG(&rxm_prov, FI_LOG_FABRIC,
 		       "Requesting shared receive context from core provider\n");
 		core_info->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT;
@@ -141,41 +144,41 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints,
 }
 
 int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info,
-		    struct fi_info *info)
+		    const struct fi_info *base_info, struct fi_info *info)
 {
-	info->caps = rxm_info.caps;
+	info->caps = base_info->caps;
 	// TODO find which other modes should be filtered
-	info->mode = (core_info->mode & ~FI_RX_CQ_DATA) | rxm_info.mode;
+	info->mode = (core_info->mode & ~FI_RX_CQ_DATA) | base_info->mode;
 
-	info->tx_attr->caps		= rxm_info.tx_attr->caps;
+	info->tx_attr->caps		= base_info->tx_attr->caps;
 	info->tx_attr->mode		= info->mode;
 	info->tx_attr->msg_order 	= core_info->tx_attr->msg_order;
-	info->tx_attr->comp_order 	= rxm_info.tx_attr->comp_order;
-	info->tx_attr->inject_size	= rxm_info.tx_attr->inject_size;
-	info->tx_attr->size 		= rxm_info.tx_attr->size;
-	info->tx_attr->iov_limit 	= MIN(rxm_info.tx_attr->iov_limit,
+	info->tx_attr->comp_order 	= base_info->tx_attr->comp_order;
+	info->tx_attr->inject_size	= base_info->tx_attr->inject_size;
+	info->tx_attr->size 		= base_info->tx_attr->size;
+	info->tx_attr->iov_limit 	= MIN(base_info->tx_attr->iov_limit,
 					      core_info->tx_attr->iov_limit);
-	info->tx_attr->rma_iov_limit	= MIN(rxm_info.tx_attr->rma_iov_limit,
+	info->tx_attr->rma_iov_limit	= MIN(base_info->tx_attr->rma_iov_limit,
 					      core_info->tx_attr->rma_iov_limit);
 
-	info->rx_attr->caps		= rxm_info.rx_attr->caps;
+	info->rx_attr->caps		= base_info->rx_attr->caps;
 	info->rx_attr->mode		= info->rx_attr->mode & ~FI_RX_CQ_DATA;
 	info->rx_attr->msg_order 	= core_info->rx_attr->msg_order;
-	info->rx_attr->comp_order 	= rxm_info.rx_attr->comp_order;
-	info->rx_attr->size 		= rxm_info.rx_attr->size;
-	info->rx_attr->iov_limit 	= MIN(rxm_info.rx_attr->iov_limit,
+	info->rx_attr->comp_order 	= base_info->rx_attr->comp_order;
+	info->rx_attr->size 		= base_info->rx_attr->size;
+	info->rx_attr->iov_limit 	= MIN(base_info->rx_attr->iov_limit,
 					      core_info->rx_attr->iov_limit);
 
-	*info->ep_attr = *rxm_info.ep_attr;
+	*info->ep_attr = *base_info->ep_attr;
 	info->ep_attr->max_msg_size = core_info->ep_attr->max_msg_size;
 	info->ep_attr->max_order_raw_size = core_info->ep_attr->max_order_raw_size;
 	info->ep_attr->max_order_war_size = core_info->ep_attr->max_order_war_size;
 	info->ep_attr->max_order_waw_size = core_info->ep_attr->max_order_waw_size;
 
-	*info->domain_attr = *rxm_info.domain_attr;
+	*info->domain_attr = *base_info->domain_attr;
 	info->domain_attr->mr_mode |= core_info->domain_attr->mr_mode;
 	info->domain_attr->cq_data_size = MIN(core_info->domain_attr->cq_data_size,
-					      rxm_info.domain_attr->cq_data_size);
+					      base_info->domain_attr->cq_data_size);
 	info->domain_attr->mr_key_size = core_info->domain_attr->mr_key_size;
 
 	if (core_info->nic) {
@@ -187,22 +190,33 @@ int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info,
 	return 0;
 }
 
-static int rxm_init_info(void)
+static void rxm_init_infos(void)
 {
-	size_t param;
+	struct fi_info *cur;
+	size_t buf_size, tx_size = 0, rx_size = 0;
 
-	if (!fi_param_get_size_t(&rxm_prov, "buffer_size", &param)) {
-		if (param < sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr)) {
+	if (!fi_param_get_size_t(&rxm_prov, "buffer_size", &buf_size)) {
+		if (buf_size <
+		    sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr)) {
 			FI_WARN(&rxm_prov, FI_LOG_CORE,
 				"Requested buffer size too small\n");
-			return -FI_EINVAL;
+			buf_size = sizeof(struct rxm_pkt) +
+				   sizeof(struct rxm_rndv_hdr);
 		}
 
-		rxm_eager_limit = param - sizeof(struct rxm_pkt);
+		rxm_eager_limit = buf_size - sizeof(struct rxm_pkt);
+	}
+
+	fi_param_get_size_t(&rxm_prov, "tx_size", &tx_size);
+	fi_param_get_size_t(&rxm_prov, "rx_size", &rx_size);
+
+	for (cur = (struct fi_info *) rxm_util_prov.info; cur; cur = cur->next) {
+		cur->tx_attr->inject_size = rxm_eager_limit;
+		if (tx_size)
+			cur->tx_attr->size = tx_size;
+		if (rx_size)
+			cur->rx_attr->size = rx_size;
 	}
-	rxm_info.tx_attr->inject_size = rxm_eager_limit;
-	rxm_util_prov.info = &rxm_info;
-	return 0;
 }
 
 static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info)
@@ -264,7 +278,7 @@ static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info)
 				cur->domain_attr->data_progress = FI_PROGRESS_MANUAL;
 
 			if (hints->ep_attr && hints->ep_attr->mem_tag_format &&
-			    (info->caps & FI_TAGGED)) {
+			    (info->caps & (FI_TAGGED | FI_COLLECTIVE))) {
 				FI_INFO(&rxm_prov, FI_LOG_CORE,
 					"mem_tag_format requested: 0x%" PRIx64
 					" (note: provider doesn't optimize "
@@ -274,6 +288,7 @@ static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info)
 					hints->ep_attr->mem_tag_format;
 			}
 		}
+
 		if (cur->domain_attr->data_progress == FI_PROGRESS_AUTO ||
 		    force_auto_progress)
 			cur->domain_attr->threading = FI_THREAD_SAFE;
@@ -361,13 +376,30 @@ struct fi_provider rxm_prov = {
 	.cleanup = rxm_fini
 };
 
-static void rxm_param_get_def_wait(void)
+static void rxm_get_def_wait(void)
 {
 	char *wait_str = NULL;
 
+	fi_param_define(&rxm_prov, "def_wait_obj", FI_PARAM_STRING,
+			"Specifies the default wait object used for blocking "
+			"operations (e.g. fi_cq_sread).  Supported values "
+			"are: fd and pollfd (default: fd).");
+
+	fi_param_define(&rxm_prov, "def_tcp_wait_obj", FI_PARAM_STRING,
+			"See def_wait_obj for description.  If set, this "
+			"overrides the def_wait_obj when running over the "
+			"tcp provider.");
+
 	fi_param_get_str(&rxm_prov, "def_wait_obj", &wait_str);
 	if (wait_str && !strcasecmp(wait_str, "pollfd"))
 		def_wait_obj = FI_WAIT_POLLFD;
+
+	wait_str = NULL;
+	fi_param_get_str(&rxm_prov, "def_tcp_wait_obj", &wait_str);
+	if (wait_str) {
+		def_tcp_wait_obj = (!strcasecmp(wait_str, "pollfd")) ?
+				   FI_WAIT_POLLFD : FI_WAIT_FD;
+	}
 }
 
 RXM_INI
@@ -438,16 +470,9 @@ RXM_INI
 			"Force auto-progress for data transfers even if app "
 			"requested manual progress (default: false/no).");
 
-	fi_param_define(&rxm_prov, "def_wait_obj", FI_PARAM_STRING,
-			"Specifies the default wait object used for blocking "
-			"operations (e.g. fi_cq_sread).  Supported values "
-			"are: fd and pollfd (default: fd).");
-
-	fi_param_get_size_t(&rxm_prov, "tx_size", &rxm_info.tx_attr->size);
-	fi_param_get_size_t(&rxm_prov, "rx_size", &rxm_info.rx_attr->size);
+	rxm_init_infos();
 	fi_param_get_size_t(&rxm_prov, "msg_tx_size", &rxm_msg_tx_size);
 	fi_param_get_size_t(&rxm_prov, "msg_rx_size", &rxm_msg_rx_size);
-	fi_param_get_size_t(NULL, "universe_size", &rxm_def_univ_size);
 	if (fi_param_get_int(&rxm_prov, "cm_progress_interval",
 				(int *) &rxm_cm_progress_interval))
 		rxm_cm_progress_interval = 10000;
@@ -455,17 +480,12 @@ RXM_INI
 				(int *) &rxm_cq_eq_fairness))
 		rxm_cq_eq_fairness = 128;
 	fi_param_get_bool(&rxm_prov, "data_auto_progress", &force_auto_progress);
-	rxm_param_get_def_wait();
+	rxm_get_def_wait();
 
 	if (force_auto_progress)
 		FI_INFO(&rxm_prov, FI_LOG_CORE, "auto-progress for data requested "
 			"(FI_OFI_RXM_DATA_AUTO_PROGRESS = 1), domain threading "
 			"level would be set to FI_THREAD_SAFE\n");
 
-	if (rxm_init_info()) {
-		FI_WARN(&rxm_prov, FI_LOG_CORE, "Unable to initialize rxm_info\n");
-		return NULL;
-	}
-
 	return &rxm_prov;
 }
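
Editor's note: rxm_get_def_wait() above registers def_wait_obj and def_tcp_wait_obj via fi_param_define(), making both runtime-tunable. Assuming libfabric's usual FI_<PROVIDER>_<PARAM> environment-variable naming for such parameters (the exact spelling is not stated in the patch), selecting pollfd wait objects only when rxm layers over tcp would look roughly like:

```c
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>

int main(void)
{
	struct fi_info *hints, *info = NULL;
	int ret;

	/* Assumed env-var spelling per the FI_<PROV>_<PARAM> convention. */
	setenv("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd", 1);

	hints = fi_allocinfo();
	if (!hints)
		return 1;
	hints->fabric_attr->prov_name = strdup("ofi_rxm");

	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
	if (!ret)
		fi_freeinfo(info);
	fi_freeinfo(hints);
	return ret ? 1 : 0;
}
```
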
diff --git a/deps/libfabric/prov/rxm/src/rxm_rma.c b/deps/libfabric/prov/rxm/src/rxm_rma.c
index fbb554f9f88821054f190cb6c0cd255f79105fde..c665b7b1b950c9d3bdabaeefb484caa9a049a192 100644
--- a/deps/libfabric/prov/rxm/src/rxm_rma.c
+++ b/deps/libfabric/prov/rxm/src/rxm_rma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Intel Corporation. All rights reserved.
+ * Copyright (c) 2017-2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -32,13 +32,11 @@
 
 #include "rxm.h"
 
-typedef ssize_t rxm_rma_msg_fn(struct fid_ep *ep_fid,
-			       const struct fi_msg_rma *msg, uint64_t flags);
 
-static inline ssize_t
+static ssize_t
 rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov,
 		   void **desc, void **desc_storage, size_t iov_count,
-		   uint64_t comp_flags, struct rxm_rma_buf *rma_buf)
+		   uint64_t access, struct rxm_rma_buf *rma_buf)
 {
 	size_t i, ret;
 
@@ -47,7 +45,7 @@ rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov,
 
 	if (!rxm_ep->rdm_mr_local) {
 		ret = rxm_msg_mr_regv(rxm_ep, msg_iov, iov_count, SIZE_MAX,
-				      comp_flags, rma_buf->mr.mr);
+				      access, rma_buf->mr.mr);
 		if (OFI_UNLIKELY(ret))
 			return ret;
 
@@ -61,26 +59,28 @@ rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov,
 	return FI_SUCCESS;
 }
 
-static inline ssize_t
-rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t flags,
-		  rxm_rma_msg_fn rma_msg, uint64_t comp_flags)
+static ssize_t
+rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
+		  uint64_t flags, ssize_t (*rma_msg)(struct fid_ep *ep_fid,
+		  const struct fi_msg_rma *msg, uint64_t flags),
+		  uint64_t comp_flags)
 {
 	struct rxm_rma_buf *rma_buf;
 	struct fi_msg_rma msg_rma = *msg;
 	struct rxm_conn *rxm_conn;
 	void *mr_desc[RXM_IOV_LIMIT] = { 0 };
-	int ret;
+	ssize_t ret;
 
 	assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit);
 
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
 
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
-	rma_buf = rxm_rma_buf_alloc(rxm_ep);
-	if (OFI_UNLIKELY(!rma_buf)) {
+	rma_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool);
+	if (!rma_buf) {
 		ret = -FI_EAGAIN;
 		goto unlock;
 	}
@@ -89,8 +89,8 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t
 	rma_buf->flags = flags;
 
 	ret = rxm_ep_rma_reg_iov(rxm_ep, msg_rma.msg_iov, msg_rma.desc, mr_desc,
-				 msg_rma.iov_count, comp_flags & (FI_WRITE | FI_READ),
-				 rma_buf);
+				 msg_rma.iov_count,
+				 comp_flags & (FI_WRITE | FI_READ), rma_buf);
 	if (OFI_UNLIKELY(ret))
 		goto release;
 
@@ -110,12 +110,13 @@ unlock:
 	return ret;
 }
 
-static inline ssize_t
-rxm_ep_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags)
+static ssize_t
+rxm_ep_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
+	       uint64_t flags)
 {
-	struct rxm_ep *rxm_ep =
-		container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	return rxm_ep_rma_common(rxm_ep, msg, flags | rxm_ep->util_ep.tx_msg_flags,
 				 fi_readmsg, FI_READ);
 }
@@ -169,14 +170,14 @@ static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len,
 		.context = context,
 		.data = 0,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep->util_ep.tx_op_flags,
 				 fi_readmsg, FI_READ);
 }
 
-static inline void
+static void
 rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig_msg,
 		      struct iovec *rxm_iov, struct fi_msg_rma *rxm_msg)
 {
@@ -196,9 +197,10 @@ rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig
 	rxm_msg->rma_iov_count = orig_msg->rma_iov_count;
 }
 
-static inline ssize_t
-rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, size_t total_size,
-			      const struct fi_msg_rma *msg, uint64_t flags)
+static ssize_t
+rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
+			      size_t total_size, const struct fi_msg_rma *msg,
+			      uint64_t flags)
 {
 	struct rxm_rma_buf *rma_buf;
 	ssize_t ret;
@@ -207,8 +209,8 @@ rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 
 	assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit);
 
-	rma_buf = rxm_rma_buf_alloc(rxm_ep);
-	if (OFI_UNLIKELY(!rma_buf))
+	rma_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool);
+	if (!rma_buf)
 		return -FI_EAGAIN;
 
 	rma_buf->pkt.hdr.size = total_size;
@@ -227,7 +229,7 @@ rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	return ret;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_rma_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 			  const void *buf, size_t len, uint64_t data,
 			  fi_addr_t dest_addr, uint64_t addr, uint64_t key,
@@ -256,8 +258,9 @@ rxm_ep_rma_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
 	return rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, len, &msg, flags);
 }
 
-static inline ssize_t
-rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t flags)
+static ssize_t
+rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
+			 uint64_t flags)
 {
 	struct rxm_conn *rxm_conn;
 	size_t total_size = ofi_total_iov_len(msg->msg_iov, msg->iov_count);
@@ -267,7 +270,7 @@ rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, ui
 
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
 
-	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
+	ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn);
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
@@ -275,8 +278,8 @@ rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, ui
 	    rxm_ep->util_ep.wr_cntr ||
 	    (flags & FI_COMPLETION) || (msg->iov_count > 1) ||
 	    (msg->rma_iov_count > 1)) {
-		ret = rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, total_size,
-						    msg, flags);
+		ret = rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn,
+						    total_size, msg, flags);
 		goto unlock;
 	}
 
@@ -303,13 +306,13 @@ unlock:
 	return ret;
 }
 
-static inline ssize_t
+static ssize_t
 rxm_ep_generic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 			uint64_t flags)
 {
-	struct rxm_ep *rxm_ep =
-		container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	if (flags & FI_INJECT)
 		return rxm_ep_rma_inject_common(rxm_ep, msg, flags);
 	else
@@ -317,13 +320,15 @@ rxm_ep_generic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 					 fi_writemsg, FI_WRITE);
 }
 
-static inline ssize_t
-rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags)
+static ssize_t
+rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
+		uint64_t flags)
 {
-	struct rxm_ep *rxm_ep =
-		container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
-	return rxm_ep_generic_writemsg(ep_fid, msg, flags | rxm_ep->util_ep.tx_msg_flags);
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	return rxm_ep_generic_writemsg(ep_fid, msg, flags |
+				       rxm_ep->util_ep.tx_msg_flags);
 }
 
 static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov,
@@ -345,10 +350,11 @@ static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov,
 		.context = context,
 		.data = 0,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
-	return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep->util_ep.tx_op_flags);
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
+	return rxm_ep_generic_writemsg(ep_fid, &msg,
+				       rxm_ep->util_ep.tx_op_flags);
 }
 
 static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf,
@@ -375,9 +381,9 @@ static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf,
 		.context = context,
 		.data = data,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep->util_ep.tx_op_flags |
 				       FI_REMOTE_CQ_DATA);
 }
@@ -405,9 +411,9 @@ static ssize_t rxm_ep_write(struct fid_ep *ep_fid, const void *buf,
 		.context = context,
 		.data = 0,
 	};
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep->util_ep.tx_op_flags);
 }
 
@@ -415,22 +421,22 @@ static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf,
 				   size_t len, fi_addr_t dest_addr,
 				   uint64_t addr, uint64_t key)
 {
-	ssize_t ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
 
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
 
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
 	if (len > rxm_ep->msg_info->tx_attr->inject_size ||
 	    rxm_ep->util_ep.wr_cntr) {
-		ret = rxm_ep_rma_emulate_inject(
-			rxm_ep, rxm_conn, buf, len, 0,
-			dest_addr, addr, key, FI_INJECT);
+		ret = rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, 0,
+						dest_addr, addr, key,
+						FI_INJECT);
 		goto unlock;
 	}
 
@@ -450,13 +456,14 @@ static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf,
 				       fi_addr_t dest_addr, uint64_t addr,
 				       uint64_t key)
 {
-	ssize_t ret;
 	struct rxm_conn *rxm_conn;
-	struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep,
-					     util_ep.ep_fid.fid);
+	struct rxm_ep *rxm_ep;
+	ssize_t ret;
+
+	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
 	ofi_ep_lock_acquire(&rxm_ep->util_ep);
 
-	ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn);
+	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
 	if (OFI_UNLIKELY(ret))
 		goto unlock;
 
diff --git a/deps/libfabric/prov/shm/src/smr.h b/deps/libfabric/prov/shm/src/smr.h
index b5ab31580c0fa30e6b114ce5013fcdd0c8336c5f..f6a494b8fab0660faf723f0b30968637e0af6af5 100644
--- a/deps/libfabric/prov/shm/src/smr.h
+++ b/deps/libfabric/prov/shm/src/smr.h
@@ -60,6 +60,7 @@
 #include <ofi_signal.h>
 #include <ofi_util.h>
 #include <ofi_atomic.h>
+#include <ofi_iov.h>
 
 #ifndef _SMR_H_
 #define _SMR_H_
@@ -82,6 +83,11 @@ struct smr_av {
 	size_t			used;
 };
 
+static inline int64_t smr_addr_lookup(struct util_av *av, fi_addr_t fiaddr)
+{
+	return *((int64_t *) ofi_av_get_addr(av, fiaddr));
+}
+
 int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 		struct fid_domain **dom, void *context);
 
@@ -99,18 +105,20 @@ int smr_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
 struct smr_rx_entry {
 	struct dlist_entry	entry;
 	void			*context;
-	fi_addr_t		addr;
+	int64_t			peer_id;
 	uint64_t		tag;
 	uint64_t		ignore;
 	struct iovec		iov[SMR_IOV_LIMIT];
 	uint32_t		iov_count;
 	uint16_t		flags;
 	uint64_t		err;
+	enum fi_hmem_iface	iface;
+	uint64_t		device;
 };
 
 struct smr_tx_entry {
 	struct smr_cmd	cmd;
-	fi_addr_t	addr;
+	int64_t		peer_id;
 	void		*context;
 	struct iovec	iov[SMR_IOV_LIMIT];
 	uint32_t	iov_count;
@@ -118,6 +126,8 @@ struct smr_tx_entry {
 	int		next;
 	void		*map_ptr;
 	struct smr_ep_name *map_name;
+	enum fi_hmem_iface	iface;
+	uint64_t		device;
 };
 
 struct smr_sar_entry {
@@ -128,6 +138,8 @@ struct smr_sar_entry {
 	int			next;
 	struct iovec		iov[SMR_IOV_LIMIT];
 	size_t			iov_count;
+	enum fi_hmem_iface	iface;
+	uint64_t		device;
 };
 
 struct smr_ep;
@@ -139,15 +151,14 @@ typedef int (*smr_tx_comp_func)(struct smr_ep *ep, void *context, uint32_t op,
 
 
 struct smr_match_attr {
-	fi_addr_t	addr;
+	int64_t		id;
 	uint64_t	tag;
 	uint64_t	ignore;
 };
 
-static inline int smr_match_addr(fi_addr_t addr, fi_addr_t match_addr)
+static inline int smr_match_id(int64_t id, int64_t match_id)
 {
-	return (addr == FI_ADDR_UNSPEC) || (match_addr == FI_ADDR_UNSPEC) ||
-		(addr == match_addr);
+	return (id == -1) || (match_id == -1) || (id == match_id);
 }
 
 static inline int smr_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_tag)
@@ -155,6 +166,18 @@ static inline int smr_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_ta
 	return ((tag | ignore) == (match_tag | ignore));
 }
 
+static inline enum fi_hmem_iface smr_get_mr_hmem_iface(struct util_domain *domain,
+				void **desc, uint64_t *device)
+{
+	if (!(domain->mr_mode & FI_MR_HMEM) || !desc || !*desc) {
+		*device = 0;
+		return FI_HMEM_SYSTEM;
+	}
+
+	*device = ((struct ofi_mr *) *desc)->device;
+	return ((struct ofi_mr *) *desc)->iface;
+}
+
 struct smr_unexp_msg {
 	struct dlist_entry entry;
 	struct smr_cmd cmd;
@@ -217,7 +240,7 @@ struct smr_ep {
 	size_t			min_multi_recv_size;
 	const char		*name;
 	uint64_t		msg_id;
-	struct smr_region	*region;
+	struct smr_region	*volatile region;
 	struct smr_recv_fs	*recv_fs; /* protected by rx_cq lock */
 	struct smr_queue	recv_queue;
 	struct smr_queue	trecv_queue;
@@ -235,7 +258,7 @@ struct smr_ep {
 static inline int smr_mmap_name(char *shm_name, const char *ep_name,
 				uint64_t msg_id)
 {
-	return snprintf(shm_name, NAME_MAX - 1, "%s_%ld",
+	return snprintf(shm_name, SMR_NAME_MAX - 1, "%s_%ld",
 			ep_name, msg_id);
 }
 
@@ -247,34 +270,37 @@ int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 int smr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr,
 		  struct fid_cntr **cntr_fid, void *context);
 
-int smr_verify_peer(struct smr_ep *ep, int peer_id);
+int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr);
 
 void smr_format_pend_resp(struct smr_tx_entry *pend, struct smr_cmd *cmd,
-			  void *context, const struct iovec *iov,
-			  uint32_t iov_count, fi_addr_t id,
-			  struct smr_resp *resp);
-void smr_generic_format(struct smr_cmd *cmd, fi_addr_t peer_id, uint32_t op,
+			  void *context, enum fi_hmem_iface iface, uint64_t device,
+			  const struct iovec *iov, uint32_t iov_count,
+			  int64_t id, struct smr_resp *resp);
+void smr_generic_format(struct smr_cmd *cmd, int64_t peer_id, uint32_t op,
 			uint64_t tag, uint64_t data, uint64_t op_flags);
-void smr_format_inline(struct smr_cmd *cmd, const struct iovec *iov,
-		       size_t count);
-void smr_format_inject(struct smr_cmd *cmd, const struct iovec *iov,
-		       size_t count, struct smr_region *smr,
-		       struct smr_inject_buf *tx_buf);
+void smr_format_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface,
+		       uint64_t device, const struct iovec *iov, size_t count);
+void smr_format_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
+		       const struct iovec *iov, size_t count,
+		       struct smr_region *smr, struct smr_inject_buf *tx_buf);
 void smr_format_iov(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
 		    size_t total_len, struct smr_region *smr,
 		    struct smr_resp *resp);
 int smr_format_mmap(struct smr_ep *ep, struct smr_cmd *cmd,
 		    const struct iovec *iov, size_t count, size_t total_len,
 		    struct smr_tx_entry *pend, struct smr_resp *resp);
-void smr_format_sar(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
+void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
+		    const struct iovec *iov, size_t count,
 		    size_t total_len, struct smr_region *smr,
 		    struct smr_region *peer_smr, struct smr_sar_msg *sar_msg,
 		    struct smr_tx_entry *pending, struct smr_resp *resp);
 size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
-		       struct smr_cmd *cmd, const struct iovec *iov, size_t count,
+		       struct smr_cmd *cmd, enum fi_hmem_iface iface,
+		       uint64_t device, const struct iovec *iov, size_t count,
 		       size_t *bytes_done, int *next);
 size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
-			 struct smr_cmd *cmd, const struct iovec *iov, size_t count,
+			 struct smr_cmd *cmd, enum fi_hmem_iface iface,
+			 uint64_t device, const struct iovec *iov, size_t count,
 			 size_t *bytes_done, int *next);
 
 int smr_complete_tx(struct smr_ep *ep, void *context, uint32_t op,
@@ -284,7 +310,7 @@ int smr_tx_comp(struct smr_ep *ep, void *context, uint32_t op,
 int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
 		uint16_t flags, uint64_t err);
 int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op,
-		uint16_t flags, size_t len, void *buf, fi_addr_t addr,
+		uint16_t flags, size_t len, void *buf, int64_t id,
 		uint64_t tag, uint64_t data, uint64_t err);
 int smr_rx_comp(struct smr_ep *ep, void *context, uint32_t op,
 		uint16_t flags, size_t len, void *buf, fi_addr_t addr,
@@ -303,6 +329,42 @@ uint64_t smr_rx_cq_flags(uint32_t op, uint16_t op_flags);
 
 void smr_ep_progress(struct util_ep *util_ep);
 
+static inline bool smr_cma_enabled(struct smr_ep *ep,
+				   struct smr_region *peer_smr)
+{
+	return ep->region->cma_cap == SMR_CMA_CAP_ON ||
+	       ep->region == peer_smr;
+}
+
+static inline int smr_cma_loop(pid_t pid, struct iovec *local,
+			unsigned long local_cnt, struct iovec *remote,
+			unsigned long remote_cnt, unsigned long flags,
+			size_t total, bool write)
+{
+	ssize_t ret;
+
+	while (1) {
+		if (write)
+			ret = ofi_process_vm_writev(pid, local, local_cnt, remote,
+						    remote_cnt, flags);
+		else
+			ret = ofi_process_vm_readv(pid, local, local_cnt, remote,
+						   remote_cnt, flags);
+		if (ret < 0) {
+			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+				"CMA error %d\n", errno);
+			return -FI_EIO;
+		}
+
+		total -= ret;
+		if (!total)
+			return FI_SUCCESS;
+
+		ofi_consume_iov(local, &local_cnt, (size_t) ret);
+		ofi_consume_iov(remote, &remote_cnt, (size_t) ret);
+	}
+}
+
 int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry,
 			     struct smr_queue *unexp_queue);
 
diff --git a/deps/libfabric/prov/shm/src/smr_atomic.c b/deps/libfabric/prov/shm/src/smr_atomic.c
index 9af422bc3132c28d5a40b2412b30118813643178..aaa305ea28a84edb672b174e43c05dd0da899de7 100644
--- a/deps/libfabric/prov/shm/src/smr_atomic.c
+++ b/deps/libfabric/prov/shm/src/smr_atomic.c
@@ -35,6 +35,7 @@
 #include <sys/uio.h>
 
 #include "ofi_iov.h"
+#include "ofi_hmem.h"
 #include "smr.h"
 
 
@@ -53,6 +54,7 @@ static void smr_generic_atomic_format(struct smr_cmd *cmd, uint8_t datatype,
 }
 
 static void smr_format_inline_atomic(struct smr_cmd *cmd,
+				     enum fi_hmem_iface iface, uint64_t device,
 				     const struct iovec *iov, size_t count,
 				     const struct iovec *compv,
 				     size_t comp_count)
@@ -64,15 +66,17 @@ static void smr_format_inline_atomic(struct smr_cmd *cmd,
 	switch (cmd->msg.hdr.op) {
 	case ofi_op_atomic:
 	case ofi_op_atomic_fetch:
-		cmd->msg.hdr.size = ofi_copy_from_iov(cmd->msg.data.msg,
-						SMR_MSG_DATA_LEN, iov, count, 0);
+		cmd->msg.hdr.size = ofi_copy_from_hmem_iov(cmd->msg.data.msg,
+						SMR_MSG_DATA_LEN, iface, device,
+						iov, count, 0);
 		break;
 	case ofi_op_atomic_compare:
-		cmd->msg.hdr.size = ofi_copy_from_iov(cmd->msg.data.buf,
-						SMR_MSG_DATA_LEN, iov, count, 0);
-		comp_size = ofi_copy_from_iov(cmd->msg.data.comp,
-					      SMR_MSG_DATA_LEN, compv,
-					      comp_count, 0);
+		cmd->msg.hdr.size = ofi_copy_from_hmem_iov(cmd->msg.data.buf,
+						SMR_MSG_DATA_LEN, iface, device,
+						iov, count, 0);
+		comp_size = ofi_copy_from_hmem_iov(cmd->msg.data.comp,
+						SMR_MSG_DATA_LEN, iface, device,
+						compv, comp_count, 0);
 		if (comp_size != cmd->msg.hdr.size)
 			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 				"atomic and compare buffer size mismatch\n");
@@ -83,6 +87,7 @@ static void smr_format_inline_atomic(struct smr_cmd *cmd,
 }
 
 static void smr_format_inject_atomic(struct smr_cmd *cmd,
+			enum fi_hmem_iface iface, uint64_t device,
 			const struct iovec *iov, size_t count,
 			const struct iovec *resultv, size_t result_count,
 			const struct iovec *compv, size_t comp_count,
@@ -99,14 +104,16 @@ static void smr_format_inject_atomic(struct smr_cmd *cmd,
 		if (cmd->msg.hdr.atomic_op == FI_ATOMIC_READ)
 			cmd->msg.hdr.size = ofi_total_iov_len(resultv, result_count);
 		else
-			cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->data,
-						SMR_INJECT_SIZE, iov, count, 0);
+			cmd->msg.hdr.size = ofi_copy_from_hmem_iov(tx_buf->data,
+						SMR_INJECT_SIZE, iface, device,
+						iov, count, 0);
 		break;
 	case ofi_op_atomic_compare:
-		cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->buf,
-						SMR_COMP_INJECT_SIZE, iov, count, 0);
-		comp_size = ofi_copy_from_iov(tx_buf->comp, SMR_COMP_INJECT_SIZE,
-					      compv, comp_count, 0);
+		cmd->msg.hdr.size = ofi_copy_from_hmem_iov(tx_buf->buf,
+						SMR_COMP_INJECT_SIZE, iface, device,
+						iov, count, 0);
+		comp_size = ofi_copy_from_hmem_iov(tx_buf->comp, SMR_COMP_INJECT_SIZE,
+					      iface, device, compv, comp_count, 0);
 		if (comp_size != cmd->msg.hdr.size)
 			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 				"atomic and compare buffer size mismatch\n");
@@ -134,7 +141,10 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 	struct iovec iov[SMR_IOV_LIMIT];
 	struct iovec compare_iov[SMR_IOV_LIMIT];
 	struct iovec result_iov[SMR_IOV_LIMIT];
-	int id, peer_id, err = 0;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	int64_t id, peer_id;
+	int err = 0;
 	uint16_t flags = 0;
 	ssize_t ret = 0;
 	size_t total_len;
@@ -144,14 +154,13 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 	assert(compare_count <= SMR_IOV_LIMIT);
 	assert(rma_count <= SMR_IOV_LIMIT);
 
-	id = (int) addr;
-	peer_id = smr_peer_data(ep->region)[id].addr.addr;
-
-	ret = smr_verify_peer(ep, id);
-	if (ret)
-		return ret;
+	id = smr_verify_peer(ep, addr);
+	if (id < 0)
+		return -FI_EAGAIN;
 
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
 	peer_smr = smr_peer_region(ep->region, id);
+
 	fastlock_acquire(&peer_smr->lock);
 	if (peer_smr->cmd_cnt < 2 || smr_peer_data(ep->region)[id].sar_status) {
 		ret = -FI_EAGAIN;
@@ -192,16 +201,18 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 		break;
 	}
 
+	iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, &device);
+
 	smr_generic_format(cmd, peer_id, op, 0, 0, op_flags);
 	smr_generic_atomic_format(cmd, datatype, atomic_op);
 
 	if (total_len <= SMR_MSG_DATA_LEN && !(flags & SMR_RMA_REQ) &&
 	    !(op_flags & FI_DELIVERY_COMPLETE)) {
-		smr_format_inline_atomic(cmd, iov, count, compare_iov,
+		smr_format_inline_atomic(cmd, iface, device, iov, count, compare_iov,
 					 compare_count);
 	} else if (total_len <= SMR_INJECT_SIZE) {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
-		smr_format_inject_atomic(cmd, iov, count, result_iov,
+		smr_format_inject_atomic(cmd, iface, device, iov, count, result_iov,
 					 result_count, compare_iov, compare_count,
 					 peer_smr, tx_buf);
 		if (flags & SMR_RMA_REQ || op_flags & FI_DELIVERY_COMPLETE) {
@@ -212,7 +223,7 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep,
 			}
 			resp = ofi_cirque_tail(smr_resp_queue(ep->region));
 			pend = freestack_pop(ep->pend_fs);
-			smr_format_pend_resp(pend, cmd, context, result_iov,
+			smr_format_pend_resp(pend, cmd, context, iface, device, result_iov,
 					     result_count, id, resp);
 			cmd->msg.hdr.data = smr_get_offset(ep->region, resp);
 			ofi_cirque_commit(smr_resp_queue(ep->region));
@@ -314,7 +325,7 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 	struct smr_cmd *cmd;
 	struct iovec iov;
 	struct fi_rma_ioc rma_ioc;
-	int id, peer_id;
+	int64_t id, peer_id;
 	ssize_t ret = 0;
 	size_t total_len;
 
@@ -322,14 +333,13 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	id = (int) dest_addr;
-	peer_id = smr_peer_data(ep->region)[id].addr.addr;
-
-	ret = smr_verify_peer(ep, id);
-	if (ret)
-		return ret;
+	id = smr_verify_peer(ep, dest_addr);
+	if (id < 0)
+		return -FI_EAGAIN;
 
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
 	peer_smr = smr_peer_region(ep->region, id);
+
 	fastlock_acquire(&peer_smr->lock);
 	if (peer_smr->cmd_cnt < 2 || smr_peer_data(ep->region)[id].sar_status) {
 		ret = -FI_EAGAIN;
@@ -350,11 +360,11 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf,
 	smr_generic_atomic_format(cmd, datatype, op);
 
 	if (total_len <= SMR_MSG_DATA_LEN) {
-		smr_format_inline_atomic(cmd, &iov, 1, NULL, 0);
+		smr_format_inline_atomic(cmd, FI_HMEM_SYSTEM, 0, &iov, 1, NULL, 0);
 	} else if (total_len <= SMR_INJECT_SIZE) {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
-		smr_format_inject_atomic(cmd, &iov, 1, NULL, 0, NULL, 0,
-					 peer_smr, tx_buf);
+		smr_format_inject_atomic(cmd, FI_HMEM_SYSTEM, 0, &iov, 1, NULL,
+					 0, NULL, 0, peer_smr, tx_buf);
 	}
 
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
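
Editor's note: the atomic paths above swap ofi_copy_from_iov() for ofi_copy_from_hmem_iov(), threading the MR-derived iface/device pair through each format helper. The inject path passes FI_HMEM_SYSTEM and device 0 unconditionally, which only works if the hmem-aware copy degenerates to a plain host copy for system memory. A sketch of that assumed equivalence:

```c
#include <sys/uio.h>
#include "ofi_hmem.h"	/* internal: enum fi_hmem_iface */
#include "ofi_iov.h"	/* internal: ofi_copy_from_iov() */

/* Assumed: for FI_HMEM_SYSTEM the two helpers copy identically, so the
 * branch below is at most an optimization, never a behavior change. */
static size_t copy_payload(void *dst, size_t len, enum fi_hmem_iface iface,
			   uint64_t device, const struct iovec *iov,
			   size_t count)
{
	if (iface == FI_HMEM_SYSTEM)
		return ofi_copy_from_iov(dst, len, iov, count, 0);

	return ofi_copy_from_hmem_iov(dst, len, iface, device, iov, count, 0);
}
```
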
diff --git a/deps/libfabric/prov/shm/src/smr_attr.c b/deps/libfabric/prov/shm/src/smr_attr.c
index 2c86d5cb03a8fbfe37d036eb690210d087c57a3c..34026d5964f403bd01c8e840e8a4c908f6d7cccc 100644
--- a/deps/libfabric/prov/shm/src/smr_attr.c
+++ b/deps/libfabric/prov/shm/src/smr_attr.c
@@ -60,6 +60,26 @@ struct fi_rx_attr smr_rx_attr = {
 	.iov_limit = SMR_IOV_LIMIT
 };
 
+struct fi_tx_attr smr_hmem_tx_attr = {
+	.caps = SMR_TX_CAPS | FI_HMEM,
+	.op_flags = SMR_TX_OP_FLAGS,
+	.comp_order = FI_ORDER_NONE,
+	.msg_order = SMR_RMA_ORDER | FI_ORDER_SAS,
+	.inject_size = 0,
+	.size = 1024,
+	.iov_limit = SMR_IOV_LIMIT,
+	.rma_iov_limit = SMR_IOV_LIMIT
+};
+
+struct fi_rx_attr smr_hmem_rx_attr = {
+	.caps = SMR_RX_CAPS | FI_HMEM,
+	.op_flags = SMR_RX_OP_FLAGS,
+	.comp_order = FI_ORDER_STRICT,
+	.msg_order = SMR_RMA_ORDER | FI_ORDER_SAS,
+	.size = 1024,
+	.iov_limit = SMR_IOV_LIMIT
+};
+
 struct fi_ep_attr smr_ep_attr = {
 	.type = FI_EP_RDM,
 	.protocol = FI_PROTO_SHM,
@@ -98,6 +118,16 @@ struct fi_fabric_attr smr_fabric_attr = {
 	.prov_version = OFI_VERSION_DEF_PROV
 };
 
+struct fi_info smr_hmem_info = {
+	.caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_HMEM | FI_MULTI_RECV,
+	.addr_format = FI_ADDR_STR,
+	.tx_attr = &smr_hmem_tx_attr,
+	.rx_attr = &smr_hmem_rx_attr,
+	.ep_attr = &smr_ep_attr,
+	.domain_attr = &smr_domain_attr,
+	.fabric_attr = &smr_fabric_attr
+};
+
 struct fi_info smr_info = {
 	.caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_MULTI_RECV,
 	.addr_format = FI_ADDR_STR,
@@ -105,5 +135,6 @@ struct fi_info smr_info = {
 	.rx_attr = &smr_rx_attr,
 	.ep_attr = &smr_ep_attr,
 	.domain_attr = &smr_domain_attr,
-	.fabric_attr = &smr_fabric_attr
+	.fabric_attr = &smr_fabric_attr,
+	.next = &smr_hmem_info,
 };
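
Editor's note: smr_attr.c now chains a second fi_info, smr_hmem_info, behind smr_info via .next, advertising FI_HMEM with a zero inject size. Applications opt in through hints in the usual way; a hedged sketch of that opt-in (generic fi_getinfo usage, not an shm-specific API):

```c
#include <string.h>
#include <rdma/fabric.h>

static struct fi_info *get_shm_hmem_info(void)
{
	struct fi_info *hints, *info = NULL;

	hints = fi_allocinfo();
	if (!hints)
		return NULL;

	hints->fabric_attr->prov_name = strdup("shm");
	hints->caps = FI_MSG | FI_HMEM;
	hints->domain_attr->mr_mode = FI_MR_HMEM;

	if (fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info))
		info = NULL;
	fi_freeinfo(hints);
	return info;
}
```
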
diff --git a/deps/libfabric/prov/shm/src/smr_av.c b/deps/libfabric/prov/shm/src/smr_av.c
index fd5c5cad0f5a460703d21ae982db0d17008e6e43..7ce79167af1abe0d1fa07117b8c19b583a981675 100644
--- a/deps/libfabric/prov/shm/src/smr_av.c
+++ b/deps/libfabric/prov/shm/src/smr_av.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017 Intel Corporation. All rights reserved.
+ * Copyright (c) 2015-2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -52,7 +52,7 @@ static int smr_av_close(struct fid *fid)
 
 /*
  * Input address: smr name (string)
- * output address: index (integer), the output from util_av and peer index in map
+ * output address: index (fi_addr_t), the output from util_av
  */
 static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 			 fi_addr_t *fi_addr, uint64_t flags, void *context)
@@ -63,7 +63,8 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 	struct smr_ep *smr_ep;
 	struct dlist_entry *av_entry;
 	const char *ep_name;
-	fi_addr_t index;
+	fi_addr_t util_addr;
+	int64_t shm_id = -1;
 	int i, ret;
 	int succ_count = 0;
 
@@ -73,35 +74,39 @@ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count,
 	for (i = 0; i < count; i++, addr = (char *) addr + strlen(addr) + 1) {
 		if (smr_av->used < SMR_MAX_PEERS) {
 			ep_name = smr_no_prefix(addr);
-			ret = ofi_av_insert_addr(util_av, ep_name, &index);
+			ret = smr_map_add(&smr_prov, smr_av->smr_map,
+					  ep_name, &shm_id);
+			if (!ret)
+				ret = ofi_av_insert_addr(util_av, &shm_id,
+							 &util_addr);
 		} else {
 			FI_WARN(&smr_prov, FI_LOG_AV,
 				"AV insert failed. The maximum number of AV "
 				"entries shm supported has been reached.\n");
+			util_addr = FI_ADDR_NOTAVAIL;
+			shm_id = -1; /* avoid deleting a prior entry below */
+			ret = -FI_ENOMEM;
 		}
+
+		if (fi_addr)
+			fi_addr[i] = util_addr;
+
 		if (ret) {
 			if (util_av->eq)
 				ofi_av_write_event(util_av, i, -ret, context);
+			if (shm_id >= 0)
+				smr_map_del(smr_av->smr_map, shm_id);
+			continue;
 		} else {
-			ret = smr_map_add(&smr_prov, smr_av->smr_map,
-					  ep_name, index);
-			if (ret) {
-				if (util_av->eq)
-					ofi_av_write_event(util_av, i, -ret, context);
-			} else {
-				succ_count++;
-				smr_av->used++;
-			}
+			assert(shm_id >= 0 && shm_id < SMR_MAX_PEERS);
+			smr_av->smr_map->peers[shm_id].fiaddr = util_addr;
+			succ_count++;
+			smr_av->used++;
 		}
 
-		if (fi_addr)
-			fi_addr[i] = (ret == 0) ? index : FI_ADDR_NOTAVAIL;
-
 		dlist_foreach(&util_av->ep_list, av_entry) {
 			util_ep = container_of(av_entry, struct util_ep, av_entry);
 			smr_ep = container_of(util_ep, struct smr_ep, util_ep);
-			smr_map_to_endpoint(smr_ep->region, index);
+			smr_map_to_endpoint(smr_ep->region, shm_id);
 		}
 	}
 
@@ -121,12 +126,14 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count
 	struct smr_ep *smr_ep;
 	struct dlist_entry *av_entry;
 	int i, ret = 0;
+	int64_t id;
 
 	util_av = container_of(av_fid, struct util_av, av_fid);
 	smr_av = container_of(util_av, struct smr_av, util_av);
 
 	fastlock_acquire(&util_av->lock);
 	for (i = 0; i < count; i++) {
+		id = smr_addr_lookup(util_av, fi_addr[i]);
 		ret = ofi_av_remove_addr(util_av, fi_addr[i]);
 		if (ret) {
 			FI_WARN(&smr_prov, FI_LOG_AV,
@@ -134,11 +141,11 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count
 			break;
 		}
 
-		smr_map_del(smr_av->smr_map, fi_addr[i]);
+		smr_map_del(smr_av->smr_map, id);
 		dlist_foreach(&util_av->ep_list, av_entry) {
 			util_ep = container_of(av_entry, struct util_ep, av_entry);
 			smr_ep = container_of(util_ep, struct smr_ep, util_ep);
-			smr_unmap_from_endpoint(smr_ep->region, fi_addr[i]);
+			smr_unmap_from_endpoint(smr_ep->region, id);
 		}
 		smr_av->used--;
 	}
@@ -153,14 +160,16 @@ static int smr_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
 	struct util_av *util_av;
 	struct smr_av *smr_av;
 	struct smr_region *peer_smr;
-	int peer_id = (int)fi_addr;
+	int64_t id;
 
 	util_av = container_of(av, struct util_av, av_fid);
 	smr_av = container_of(util_av, struct smr_av, util_av);
-	peer_smr = smr_map_get(smr_av->smr_map, peer_id);
+
+	id = smr_addr_lookup(util_av, fi_addr);
+	peer_smr = smr_map_get(smr_av->smr_map, id);
 
 	if (!peer_smr)
-		return -FI_ADDR_NOTAVAIL;
+		return -FI_ENODATA;
 
 	strncpy((char *)addr, smr_name(peer_smr), *addrlen);
 	((char *) addr)[MIN(*addrlen - 1, strlen(smr_name(peer_smr)))] = '\0';
@@ -220,7 +229,8 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
 	if (!smr_av)
 		return -FI_ENOMEM;
 
-	util_attr.addrlen = NAME_MAX;
+	util_attr.addrlen = sizeof(int64_t);
+	util_attr.context_len = 0;
 	util_attr.flags = 0;
 	if (attr->count > SMR_MAX_PEERS) {
 		ret = -FI_ENOSYS;
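
smr_addr_lookup(), used throughout the hunks below, is not defined in this patch. Since the AV now stores the int64_t shm id as the address payload (util_attr.addrlen = sizeof(int64_t) above), it presumably reduces to dereferencing the stored id; a hedged sketch, assuming ofi_av_get_addr() is the util-AV accessor for the stored address bytes:

static inline int64_t smr_addr_lookup_sketch(struct util_av *av, fi_addr_t fiaddr)
{
	/* The AV entry was inserted as an int64_t shm map index. */
	return *(int64_t *) ofi_av_get_addr(av, fiaddr);
}
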
diff --git a/deps/libfabric/prov/shm/src/smr_comp.c b/deps/libfabric/prov/shm/src/smr_comp.c
index 1372dd626963b19d3abec34048220379c1640e6e..02c91823cffb8f82021187b32d546729e48e5853 100644
--- a/deps/libfabric/prov/shm/src/smr_comp.c
+++ b/deps/libfabric/prov/shm/src/smr_comp.c
@@ -89,16 +89,22 @@ int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op,
 }
 
 int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags,
-		    size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data,
+		    size_t len, void *buf, int64_t id, uint64_t tag, uint64_t data,
 		    uint64_t err)
 {
+	fi_addr_t fiaddr = FI_ADDR_UNSPEC;
+
 	ofi_ep_rx_cntr_inc_func(&ep->util_ep, op);
 
 	if (!err && !(flags & (SMR_REMOTE_CQ_DATA | SMR_RX_COMPLETION)))
 		return 0;
 
+	/* Translate the shm id to the peer's fi_addr when FI_SOURCE is set */
+	if (ep->util_ep.domain->info_domain_caps & FI_SOURCE)
+		fiaddr = ep->region->map->peers[id].fiaddr;
+
 	return ep->rx_comp(ep, context, op, flags, len, buf,
-			   addr, tag, data, err);
+			   fiaddr, tag, data, err);
 }
 
 int smr_rx_comp(struct smr_ep *ep, void *context, uint32_t op,
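
With this change, smr_complete_rx() reports the sender's fi_addr_t (recovered from peers[id].fiaddr) only when the domain was opened with FI_SOURCE. On the consumer side that address surfaces through fi_cq_readfrom(); a short sketch using the standard API:

#include <rdma/fi_cq.h>

/* Drain one completion and learn which peer produced it; requires the
 * endpoint to have been opened with FI_SOURCE. */
static ssize_t read_one_with_source(struct fid_cq *cq, fi_addr_t *src)
{
	struct fi_cq_msg_entry comp;

	*src = FI_ADDR_UNSPEC;
	/* On success, *src holds the fi_addr_t returned by fi_av_insert(). */
	return fi_cq_readfrom(cq, &comp, 1, src);
}
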
diff --git a/deps/libfabric/prov/shm/src/smr_ep.c b/deps/libfabric/prov/shm/src/smr_ep.c
index 4c7f8db61b6f6f2dbf573b1ff5aeb4217e2d10f3..cf8215b2b46f97bbde86e12bf3be372427cf8967 100644
--- a/deps/libfabric/prov/shm/src/smr_ep.c
+++ b/deps/libfabric/prov/shm/src/smr_ep.c
@@ -35,6 +35,7 @@
 #include <sys/uio.h>
 
 #include "ofi_iov.h"
+#include "ofi_hmem.h"
 #include "smr.h"
 
 extern struct fi_ops_msg smr_msg_ops;
@@ -142,7 +143,7 @@ static int smr_ep_cancel_recv(struct smr_ep *ep, struct smr_queue *queue,
 		recv_entry = container_of(entry, struct smr_rx_entry, entry);
 		ret = smr_complete_rx(ep, (void *) recv_entry->context, ofi_op_msg,
 				  recv_entry->flags, 0,
-				  NULL, recv_entry->addr,
+				  NULL, recv_entry->peer_id,
 				  recv_entry->tag, 0, FI_ECANCELED);
 		freestack_push(ep->recv_fs, recv_entry);
 		ret = ret ? ret : 1;
@@ -178,16 +179,59 @@ static struct fi_ops_ep smr_ep_ops = {
 	.tx_size_left = fi_no_tx_size_left,
 };
 
-int smr_verify_peer(struct smr_ep *ep, int peer_id)
+static void smr_send_name(struct smr_ep *ep, int64_t id)
 {
+	struct smr_region *peer_smr;
+	struct smr_cmd *cmd;
+	struct smr_inject_buf *tx_buf;
+
+	peer_smr = smr_peer_region(ep->region, id);
+
+	fastlock_acquire(&peer_smr->lock);
+
+	if (smr_peer_data(ep->region)[id].name_sent || !peer_smr->cmd_cnt)
+		goto out;
+
+	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
+
+	cmd->msg.hdr.op = SMR_OP_MAX + ofi_ctrl_connreq;
+	cmd->msg.hdr.id = id;
+
+	tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
+	cmd->msg.hdr.src_data = smr_get_offset(peer_smr, tx_buf);
+
+	cmd->msg.hdr.size = strlen(smr_name(ep->region)) + 1;
+	memcpy(tx_buf->data, smr_name(ep->region), cmd->msg.hdr.size);
+
+	smr_peer_data(ep->region)[id].name_sent = 1;
+	ofi_cirque_commit(smr_cmd_queue(peer_smr));
+	peer_smr->cmd_cnt--;
+
+out:
+	fastlock_release(&peer_smr->lock);
+}
+
+int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr)
+{
+	int64_t id;
 	int ret;
 
-	if (ep->region->map->peers[peer_id].peer.addr != FI_ADDR_UNSPEC)
-		return 0;
+	id = smr_addr_lookup(ep->util_ep.av, fi_addr);
+	assert(id < SMR_MAX_PEERS);
 
-	ret = smr_map_to_region(&smr_prov, &ep->region->map->peers[peer_id]);
+	if (smr_peer_data(ep->region)[id].addr.id >= 0)
+		return id;
 
-	return (ret == -ENOENT) ? -FI_EAGAIN : ret;
+	if (ep->region->map->peers[id].peer.id < 0) {
+		ret = smr_map_to_region(&smr_prov, &ep->region->map->peers[id]);
+		if (ret == -ENOENT)
+			return -1;
+	}
+
+	smr_send_name(ep, id);
+
+	return -1;
 }
 
 static int smr_match_msg(struct dlist_entry *item, const void *args)
@@ -196,7 +240,7 @@ static int smr_match_msg(struct dlist_entry *item, const void *args)
 	struct smr_rx_entry *recv_entry;
 
 	recv_entry = container_of(item, struct smr_rx_entry, entry);
-	return smr_match_addr(recv_entry->addr, attr->addr);
+	return smr_match_id(recv_entry->peer_id, attr->id);
 }
 
 static int smr_match_tagged(struct dlist_entry *item, const void *args)
@@ -205,7 +249,7 @@ static int smr_match_tagged(struct dlist_entry *item, const void *args)
 	struct smr_rx_entry *recv_entry;
 
 	recv_entry = container_of(item, struct smr_rx_entry, entry);
-	return smr_match_addr(recv_entry->addr, attr->addr) &&
+	return smr_match_id(recv_entry->peer_id, attr->id) &&
 	       smr_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); 
 } 
 
@@ -216,7 +260,7 @@ static int smr_match_unexp_msg(struct dlist_entry *item, const void *args)
 
 	unexp_msg = container_of(item, struct smr_unexp_msg, entry);
 	assert(unexp_msg->cmd.msg.hdr.op == ofi_op_msg);
-	return smr_match_addr(unexp_msg->cmd.msg.hdr.addr, attr->addr);
+	return smr_match_id(unexp_msg->cmd.msg.hdr.id, attr->id);
 }
 
 static int smr_match_unexp_tagged(struct dlist_entry *item, const void *args)
@@ -226,10 +270,10 @@ static int smr_match_unexp_tagged(struct dlist_entry *item, const void *args)
 
 	unexp_msg = container_of(item, struct smr_unexp_msg, entry);
 	if (unexp_msg->cmd.msg.hdr.op == ofi_op_msg)
-		return smr_match_addr(unexp_msg->cmd.msg.hdr.addr, attr->addr);
+		return smr_match_id(unexp_msg->cmd.msg.hdr.id, attr->id);
 
 	assert(unexp_msg->cmd.msg.hdr.op == ofi_op_tagged);
-	return smr_match_addr(unexp_msg->cmd.msg.hdr.addr, attr->addr) &&
+	return smr_match_id(unexp_msg->cmd.msg.hdr.id, attr->id) &&
 	       smr_match_tag(unexp_msg->cmd.msg.hdr.tag, attr->ignore,
 			     attr->tag);
 }
@@ -242,28 +286,32 @@ static void smr_init_queue(struct smr_queue *queue,
 }
 
 void smr_format_pend_resp(struct smr_tx_entry *pend, struct smr_cmd *cmd,
-			  void *context, const struct iovec *iov,
-			  uint32_t iov_count, fi_addr_t id, struct smr_resp *resp)
+			  void *context, enum fi_hmem_iface iface, uint64_t device,
+			  const struct iovec *iov, uint32_t iov_count,
+			  int64_t id, struct smr_resp *resp)
 {
 	pend->cmd = *cmd;
 	pend->context = context;
 	memcpy(pend->iov, iov, sizeof(*iov) * iov_count);
 	pend->iov_count = iov_count;
-	pend->addr = id;
+	pend->peer_id = id;
 	if (cmd->msg.hdr.op_src != smr_src_sar)
 		pend->bytes_done = 0;
 
+	pend->iface = iface;
+	pend->device = device;
+
 	resp->msg_id = (uint64_t) (uintptr_t) pend;
 	resp->status = FI_EBUSY;
 }
 
-void smr_generic_format(struct smr_cmd *cmd, fi_addr_t peer_id, uint32_t op,
+void smr_generic_format(struct smr_cmd *cmd, int64_t peer_id, uint32_t op,
 			uint64_t tag, uint64_t data, uint64_t op_flags)
 {
 	cmd->msg.hdr.op = op;
 	cmd->msg.hdr.op_flags = 0;
 	cmd->msg.hdr.tag = tag;
-	cmd->msg.hdr.addr = peer_id;
+	cmd->msg.hdr.id = peer_id;
 	cmd->msg.hdr.data = data;
 
 	if (op_flags & FI_REMOTE_CQ_DATA)
@@ -272,22 +320,23 @@ void smr_generic_format(struct smr_cmd *cmd, fi_addr_t peer_id, uint32_t op,
 		cmd->msg.hdr.op_flags |= SMR_TX_COMPLETION;
 }
 
-void smr_format_inline(struct smr_cmd *cmd, const struct iovec *iov,
-		       size_t count)
+void smr_format_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface,
+		       uint64_t device, const struct iovec *iov, size_t count)
 {
 	cmd->msg.hdr.op_src = smr_src_inline;
-	cmd->msg.hdr.size = ofi_copy_from_iov(cmd->msg.data.msg,
-					      SMR_MSG_DATA_LEN, iov, count, 0);
+	cmd->msg.hdr.size = ofi_copy_from_hmem_iov(cmd->msg.data.msg,
+						SMR_MSG_DATA_LEN, iface, device,
+						iov, count, 0);
 }
 
-void smr_format_inject(struct smr_cmd *cmd, const struct iovec *iov,
-		       size_t count, struct smr_region *smr,
-		       struct smr_inject_buf *tx_buf)
+void smr_format_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
+		       const struct iovec *iov, size_t count,
+		       struct smr_region *smr, struct smr_inject_buf *tx_buf)
 {
 	cmd->msg.hdr.op_src = smr_src_inject;
 	cmd->msg.hdr.src_data = smr_get_offset(smr, tx_buf);
-	cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->data, SMR_INJECT_SIZE,
-					      iov, count, 0);
+	cmd->msg.hdr.size = ofi_copy_from_hmem_iov(tx_buf->data, SMR_INJECT_SIZE,
+						   iface, device, iov, count, 0);
 }
 
 void smr_format_iov(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
@@ -381,14 +430,16 @@ remove_entry:
 }
 
 size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
-		       struct smr_cmd *cmd, const struct iovec *iov, size_t count,
+		       struct smr_cmd *cmd, enum fi_hmem_iface iface,
+		       uint64_t device, const struct iovec *iov, size_t count,
 		       size_t *bytes_done, int *next)
 {
 	size_t start = *bytes_done;
 
 	if (sar_msg->sar[0].status == SMR_SAR_FREE && !*next) {
-		*bytes_done += ofi_copy_from_iov(sar_msg->sar[0].buf, SMR_SAR_SIZE,
-						 iov, count, *bytes_done);
+		*bytes_done += ofi_copy_from_hmem_iov(sar_msg->sar[0].buf,
+					SMR_SAR_SIZE, iface, device,
+					iov, count, *bytes_done);
 		sar_msg->sar[0].status = SMR_SAR_READY;
 		if (cmd->msg.hdr.op == ofi_op_read_req)
 			resp->status = FI_SUCCESS;
@@ -397,8 +448,9 @@ size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 
 	if (*bytes_done < cmd->msg.hdr.size &&
 	    sar_msg->sar[1].status == SMR_SAR_FREE && *next) {
-		*bytes_done += ofi_copy_from_iov(sar_msg->sar[1].buf, SMR_SAR_SIZE,
-						 iov, count, *bytes_done);
+		*bytes_done += ofi_copy_from_hmem_iov(sar_msg->sar[1].buf,
+					SMR_SAR_SIZE, iface, device,
+					iov, count, *bytes_done);
 		sar_msg->sar[1].status = SMR_SAR_READY;
 		if (cmd->msg.hdr.op == ofi_op_read_req)
 			resp->status = FI_SUCCESS;
@@ -408,14 +460,16 @@ size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 }
 
 size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
-			 struct smr_cmd *cmd, const struct iovec *iov, size_t count,
+			 struct smr_cmd *cmd, enum fi_hmem_iface iface,
+			 uint64_t device, const struct iovec *iov, size_t count,
 			 size_t *bytes_done, int *next)
 {
 	size_t start = *bytes_done;
 
 	if (sar_msg->sar[0].status == SMR_SAR_READY && !*next) {
-		*bytes_done += ofi_copy_to_iov(iov, count, *bytes_done,
-					       sar_msg->sar[0].buf, SMR_SAR_SIZE);
+		*bytes_done += ofi_copy_to_hmem_iov(iface, device, iov, count,
+					*bytes_done, sar_msg->sar[0].buf,
+					SMR_SAR_SIZE);
 		sar_msg->sar[0].status = SMR_SAR_FREE;
 		if (cmd->msg.hdr.op != ofi_op_read_req)
 			resp->status = FI_SUCCESS;
@@ -424,8 +478,9 @@ size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 
 	if (*bytes_done < cmd->msg.hdr.size &&
 	    sar_msg->sar[1].status == SMR_SAR_READY && *next) {
-		*bytes_done += ofi_copy_to_iov(iov, count, *bytes_done,
-					       sar_msg->sar[1].buf, SMR_SAR_SIZE);
+		*bytes_done += ofi_copy_to_hmem_iov(iface, device, iov, count,
+					*bytes_done, sar_msg->sar[1].buf,
+					SMR_SAR_SIZE);
 		sar_msg->sar[1].status = SMR_SAR_FREE;
 		if (cmd->msg.hdr.op != ofi_op_read_req)
 			resp->status = FI_SUCCESS;
@@ -434,7 +489,8 @@ size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp,
 	return *bytes_done - start;
 }
 
-void smr_format_sar(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
+void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device,
+		    const struct iovec *iov, size_t count,
 		    size_t total_len, struct smr_region *smr,
 		    struct smr_region *peer_smr, struct smr_sar_msg *sar_msg,
 		    struct smr_tx_entry *pending, struct smr_resp *resp)
@@ -449,7 +505,7 @@ void smr_format_sar(struct smr_cmd *cmd, const struct iovec *iov, size_t count,
 	sar_msg->sar[0].status = SMR_SAR_FREE;
 	sar_msg->sar[1].status = SMR_SAR_FREE;
 	if (cmd->msg.hdr.op != ofi_op_read_req)
-		smr_copy_to_sar(sar_msg, NULL, cmd, iov, count,
+		smr_copy_to_sar(sar_msg, NULL, cmd, iface, device, iov, count,
 				&pending->bytes_done, &pending->next);
 }
 
@@ -467,6 +523,8 @@ static int smr_ep_close(struct fid *fid)
 	smr_recv_fs_free(ep->recv_fs);
 	smr_unexp_fs_free(ep->unexp_fs);
 	smr_pend_fs_free(ep->pend_fs);
+	smr_sar_fs_free(ep->sar_fs);
+	free((void *)ep->name);
 	free(ep);
 	return 0;
 }
@@ -616,16 +674,16 @@ static int smr_endpoint_name(char *name, char *addr, size_t addrlen,
 			     int dom_idx, int ep_idx)
 {
 	const char *start;
-	memset(name, 0, NAME_MAX);
-	if (!addr || addrlen > NAME_MAX)
+	memset(name, 0, SMR_NAME_MAX);
+	if (!addr || addrlen > SMR_NAME_MAX)
 		return -FI_EINVAL;
 
 	start = smr_no_prefix((const char *) addr);
 	if (strstr(addr, SMR_PREFIX) || dom_idx || ep_idx)
-		snprintf(name, NAME_MAX, "%s:%d:%d:%d", start, getuid(), dom_idx,
-			 ep_idx);
+		snprintf(name, SMR_NAME_MAX - 1, "%s:%d:%d:%d", start, getuid(),
+			 dom_idx, ep_idx);
 	else
-		snprintf(name, NAME_MAX, "%s", start);
+		snprintf(name, SMR_NAME_MAX - 1, "%s", start);
 
 	return 0;
 }
@@ -636,7 +694,7 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info,
 	struct smr_ep *ep;
 	struct smr_domain *smr_domain;
 	int ret, ep_idx;
-	char name[NAME_MAX];
+	char name[SMR_NAME_MAX];
 
 	ep = calloc(1, sizeof(*ep));
 	if (!ep)
@@ -652,7 +710,7 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info,
 	if (ret)
 		goto err2;
 
-	ret = smr_setname(&ep->util_ep.ep_fid.fid, name, NAME_MAX);
+	ret = smr_setname(&ep->util_ep.ep_fid.fid, name, SMR_NAME_MAX);
 	if (ret)
 		goto err2;
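
A consequence of the new name exchange in smr_verify_peer()/smr_send_name() above: the first transmit to a not-yet-mapped peer queues a connreq and fails with -FI_EAGAIN until both sides have processed it, so callers retry while driving progress. A sketch of the caller-side pattern (standard API; txcq is a hypothetical bound transmit CQ):

#include <rdma/fi_cq.h>
#include <rdma/fi_endpoint.h>

/* Retry a send until the shm peer mapping completes; reading the CQ
 * drives the provider's manual progress. */
static ssize_t send_with_retry(struct fid_ep *ep, struct fid_cq *txcq,
			       const void *buf, size_t len, fi_addr_t dest)
{
	struct fi_cq_msg_entry comp;
	ssize_t ret;

	do {
		ret = fi_send(ep, buf, len, NULL, dest, NULL);
		if (ret == -FI_EAGAIN)
			(void) fi_cq_read(txcq, &comp, 1);
	} while (ret == -FI_EAGAIN);

	return ret;
}
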
 
diff --git a/deps/libfabric/prov/shm/src/smr_init.c b/deps/libfabric/prov/shm/src/smr_init.c
index b724335357c50c17a61c0e8c9635f3bbad682137..5f41bece78b427ed15ec67fdf5239ffb2eb83517 100644
--- a/deps/libfabric/prov/shm/src/smr_init.c
+++ b/deps/libfabric/prov/shm/src/smr_init.c
@@ -35,6 +35,7 @@
 #include <ofi_prov.h>
 #include "smr.h"
 #include "smr_signal.h"
+#include <ofi_hmem.h>
 
 extern struct sigaction *old_action;
 struct smr_env smr_env = {
@@ -51,21 +52,21 @@ static void smr_init_env(void)
 static void smr_resolve_addr(const char *node, const char *service,
 			     char **addr, size_t *addrlen)
 {
-	char temp_name[NAME_MAX];
+	char temp_name[SMR_NAME_MAX];
 
 	if (service) {
 		if (node)
-			snprintf(temp_name, NAME_MAX - 1, "%s%s:%s",
+			snprintf(temp_name, SMR_NAME_MAX - 1, "%s%s:%s",
 				 SMR_PREFIX_NS, node, service);
 		else
-			snprintf(temp_name, NAME_MAX - 1, "%s%s",
+			snprintf(temp_name, SMR_NAME_MAX - 1, "%s%s",
 				 SMR_PREFIX_NS, service);
 	} else {
 		if (node)
-			snprintf(temp_name, NAME_MAX - 1, "%s%s",
+			snprintf(temp_name, SMR_NAME_MAX - 1, "%s%s",
 				 SMR_PREFIX, node);
 		else
-			snprintf(temp_name, NAME_MAX - 1, "%s%d",
+			snprintf(temp_name, SMR_NAME_MAX - 1, "%s%d",
 				 SMR_PREFIX, getpid());
 	}
 
@@ -125,7 +126,7 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service,
 	int ret;
 
 	mr_mode = hints && hints->domain_attr ? hints->domain_attr->mr_mode :
-						FI_MR_VIRT_ADDR;
+						FI_MR_VIRT_ADDR | FI_MR_HMEM;
 	msg_order = hints && hints->tx_attr ? hints->tx_attr->msg_order : 0;
 	fast_rma = smr_fast_rma_enabled(mr_mode, msg_order);
 
@@ -160,12 +161,24 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service,
 			cur->ep_attr->max_order_waw_size = 0;
 			cur->ep_attr->max_order_war_size = 0;
 		}
+		if (cur->caps & FI_HMEM) {
+			if (!(mr_mode & FI_MR_HMEM)) {
+				fi_freeinfo(cur);
+				return -FI_ENODATA;
+			}
+			cur->domain_attr->mr_mode |= FI_MR_HMEM;
+		} else {
+			cur->domain_attr->mr_mode &= ~FI_MR_HMEM;
+		}
 	}
 	return 0;
 }
 
 static void smr_fini(void)
 {
+#if HAVE_SHM_DL
+	ofi_hmem_cleanup();
+#endif
 	smr_cleanup();
 	free(old_action);
 }
@@ -187,6 +200,9 @@ struct util_prov smr_util_prov = {
 
 SHM_INI
 {
+#if HAVE_SHM_DL
+	ofi_hmem_init();
+#endif
 	fi_param_define(&smr_prov, "sar_threshold", FI_PARAM_SIZE_T,
 			"Max size to use for alternate SAR protocol if CMA \
 			 is not available before switching to mmap protocol \
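
smr_getinfo() now couples FI_HMEM to the FI_MR_HMEM mr_mode bit: requesting FI_HMEM without FI_MR_HMEM yields -FI_ENODATA, and the bit is cleared on non-HMEM infos. A sketch of hints that satisfy the new check (API version and capability set illustrative):

#include <string.h>
#include <rdma/fabric.h>

static int get_shm_hmem_info(struct fi_info **info)
{
	struct fi_info *hints;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return -FI_ENOMEM;

	hints->caps = FI_MSG | FI_HMEM;
	hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR | FI_MR_HMEM;
	hints->fabric_attr->prov_name = strdup("shm");

	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, info);
	fi_freeinfo(hints);
	return ret;
}
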
diff --git a/deps/libfabric/prov/shm/src/smr_msg.c b/deps/libfabric/prov/shm/src/smr_msg.c
index 5ac6b15b9111c2627eb537a67e5b041412d4891d..1eb86200ba6fb68cf6b340d8b50e47dfece33ec8 100644
--- a/deps/libfabric/prov/shm/src/smr_msg.c
+++ b/deps/libfabric/prov/shm/src/smr_msg.c
@@ -51,7 +51,7 @@ static inline uint16_t smr_convert_rx_flags(uint64_t fi_flags)
 }
 
 static struct smr_rx_entry *smr_get_recv_entry(struct smr_ep *ep,
-		const struct iovec *iov, size_t count, fi_addr_t addr,
+		const struct iovec *iov, void **desc, size_t count, fi_addr_t addr,
 		void *context, uint64_t tag, uint64_t ignore, uint64_t flags)
 {
 	struct smr_rx_entry *entry;
@@ -70,14 +70,19 @@ static struct smr_rx_entry *smr_get_recv_entry(struct smr_ep *ep,
 	entry->context = context;
 	entry->err = 0;
 	entry->flags = smr_convert_rx_flags(flags);
-	entry->addr = ep->util_ep.caps & FI_DIRECTED_RECV ? addr : FI_ADDR_UNSPEC;
+	entry->peer_id = ep->util_ep.caps & FI_DIRECTED_RECV &&
+				addr != FI_ADDR_UNSPEC ?
+				smr_addr_lookup(ep->util_ep.av, addr) : -1;
 	entry->tag = tag;
 	entry->ignore = ignore;
 
+	entry->iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc,
+					     &entry->device);
+
 	return entry;
 }
 
-ssize_t smr_generic_recv(struct smr_ep *ep, const struct iovec *iov,
+ssize_t smr_generic_recv(struct smr_ep *ep, const struct iovec *iov, void **desc,
 			 size_t iov_count, fi_addr_t addr, void *context,
 			 uint64_t tag, uint64_t ignore, uint64_t flags,
 			 struct smr_queue *recv_queue,
@@ -92,7 +97,7 @@ ssize_t smr_generic_recv(struct smr_ep *ep, const struct iovec *iov,
 	fastlock_acquire(&ep->region->lock);
 	fastlock_acquire(&ep->util_ep.rx_cq->cq_lock);
 
-	entry = smr_get_recv_entry(ep, iov, iov_count, addr, context, tag,
+	entry = smr_get_recv_entry(ep, iov, desc, iov_count, addr, context, tag,
 				   ignore, flags);
 	if (!entry)
 		goto out;
@@ -112,8 +117,8 @@ ssize_t smr_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_recv(ep, msg->msg_iov, msg->iov_count, msg->addr,
-				msg->context, 0, 0,
+	return smr_generic_recv(ep, msg->msg_iov, msg->desc, msg->iov_count,
+				msg->addr, msg->context, 0, 0,
 				flags | ep->util_ep.rx_msg_flags,
 				&ep->recv_queue, &ep->unexp_msg_queue);
 }
@@ -125,7 +130,7 @@ ssize_t smr_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_recv(ep, iov, count, src_addr, context, 0, 0,
+	return smr_generic_recv(ep, iov, desc, count, src_addr, context, 0, 0,
 				smr_ep_rx_flags(ep), &ep->recv_queue,
 				&ep->unexp_msg_queue);
 }
@@ -141,15 +146,15 @@ ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
 	iov.iov_base = buf;
 	iov.iov_len = len;
 
-	return smr_generic_recv(ep, &iov, 1, src_addr, context, 0, 0,
+	return smr_generic_recv(ep, &iov, &desc, 1, src_addr, context, 0, 0,
 				smr_ep_rx_flags(ep), &ep->recv_queue,
 				&ep->unexp_msg_queue);
 }
 
 static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
-				   size_t iov_count, fi_addr_t addr, uint64_t tag,
-				   uint64_t data, void *context, uint32_t op,
-				   uint64_t op_flags)
+				   void **desc, size_t iov_count, fi_addr_t addr,
+				   uint64_t tag, uint64_t data, void *context,
+				   uint32_t op, uint64_t op_flags)
 {
 	struct smr_region *peer_smr;
 	struct smr_inject_buf *tx_buf;
@@ -157,22 +162,23 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 	struct smr_resp *resp;
 	struct smr_cmd *cmd;
 	struct smr_tx_entry *pend;
-	int id, peer_id;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	int64_t id, peer_id;
 	ssize_t ret = 0;
 	size_t total_len;
 
 	assert(iov_count <= SMR_IOV_LIMIT);
 
-	id = (int) addr;
-	peer_id = smr_peer_data(ep->region)[id].addr.addr;
-
-	ret = smr_verify_peer(ep, id);
-	if (ret)
-		return ret;
+	id = smr_verify_peer(ep, addr);
+	if (id < 0)
+		return -FI_EAGAIN;
 
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
 	peer_smr = smr_peer_region(ep->region, id);
+
 	fastlock_acquire(&peer_smr->lock);
-	if (!peer_smr->cmd_cnt || smr_peer_data(ep->region)[id].sar_status) {
+	if (!peer_smr->cmd_cnt || smr_peer_data(ep->region)[id].sar_status) {
 		ret = -FI_EAGAIN;
 		goto unlock_region;
 	}
@@ -183,17 +189,19 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 		goto unlock_cq;
 	}
 
+	iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, &device);
+
 	total_len = ofi_total_iov_len(iov, iov_count);
 
 	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
 	smr_generic_format(cmd, peer_id, op, tag, data, op_flags);
 
 	if (total_len <= SMR_MSG_DATA_LEN && !(op_flags & FI_DELIVERY_COMPLETE)) {
-		smr_format_inline(cmd, iov, iov_count);
+		smr_format_inline(cmd, iface, device, iov, iov_count);
 	} else if (total_len <= SMR_INJECT_SIZE &&
 		   !(op_flags & FI_DELIVERY_COMPLETE)) {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
-		smr_format_inject(cmd, iov, iov_count, peer_smr, tx_buf);
+		smr_format_inject(cmd, iface, device, iov, iov_count, peer_smr, tx_buf);
 	} else {
 		if (ofi_cirque_isfull(smr_resp_queue(ep->region))) {
 			ret = -FI_EAGAIN;
@@ -201,15 +209,18 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 		}
 		resp = ofi_cirque_tail(smr_resp_queue(ep->region));
 		pend = freestack_pop(ep->pend_fs);
-		if (ep->region->cma_cap == SMR_CMA_CAP_ON) {
-			smr_format_iov(cmd, iov, iov_count, total_len, ep->region, resp);
+		if (smr_cma_enabled(ep, peer_smr) && iface == FI_HMEM_SYSTEM) {
+			smr_format_iov(cmd, iov, iov_count, total_len, ep->region,
+				       resp);
 		} else {
-			if (total_len <= smr_env.sar_threshold) {
+			if (total_len <= smr_env.sar_threshold ||
+			    iface != FI_HMEM_SYSTEM) {
 				if (!peer_smr->sar_cnt) {
 					ret = -FI_EAGAIN;
 				} else {
 					sar = smr_freestack_pop(smr_sar_pool(peer_smr));
-					smr_format_sar(cmd, iov, iov_count, total_len,
+					smr_format_sar(cmd, iface, device, iov,
+						       iov_count, total_len,
 						       ep->region, peer_smr, sar,
 						       pend, resp);
 					peer_smr->sar_cnt--;
@@ -225,7 +236,8 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov,
 				goto unlock_cq;
 			}
 		}
-		smr_format_pend_resp(pend, cmd, context, iov, iov_count, id, resp);
+		smr_format_pend_resp(pend, cmd, context, iface, device, iov,
+				     iov_count, id, resp);
 		ofi_cirque_commit(smr_resp_queue(ep->region));
 		goto commit;
 	}
@@ -257,7 +269,7 @@ ssize_t smr_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc,
 	msg_iov.iov_base = (void *) buf;
 	msg_iov.iov_len = len;
 
-	return smr_generic_sendmsg(ep, &msg_iov, 1, dest_addr, 0,
+	return smr_generic_sendmsg(ep, &msg_iov, &desc, 1, dest_addr, 0,
 				   0, context, ofi_op_msg, smr_ep_tx_flags(ep));
 }
 
@@ -269,7 +281,7 @@ ssize_t smr_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_sendmsg(ep, iov, count, dest_addr, 0,
+	return smr_generic_sendmsg(ep, iov, desc, count, dest_addr, 0,
 				   0, context, ofi_op_msg, smr_ep_tx_flags(ep));
 }
 
@@ -280,7 +292,7 @@ ssize_t smr_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_sendmsg(ep, msg->msg_iov, msg->iov_count,
+	return smr_generic_sendmsg(ep, msg->msg_iov, msg->desc, msg->iov_count,
 				   msg->addr, 0, msg->data, msg->context,
 				   ofi_op_msg, flags | ep->util_ep.tx_msg_flags);
 }
@@ -293,7 +305,7 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf,
 	struct smr_region *peer_smr;
 	struct smr_inject_buf *tx_buf;
 	struct smr_cmd *cmd;
-	int id, peer_id;
+	int64_t id, peer_id;
 	ssize_t ret = 0;
 	struct iovec msg_iov;
 
@@ -303,14 +315,14 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf,
 	msg_iov.iov_len = len;
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
-	id = (int) dest_addr;
-	peer_id = smr_peer_data(ep->region)[id].addr.addr;
 
-	ret = smr_verify_peer(ep, id);
-	if (ret)
-		return ret;
+	id = smr_verify_peer(ep, dest_addr);
+	if (id < 0)
+		return -FI_EAGAIN;
 
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
 	peer_smr = smr_peer_region(ep->region, id);
+
 	fastlock_acquire(&peer_smr->lock);
 	if (!peer_smr->cmd_cnt || smr_peer_data(ep->region)[id].sar_status) {
 		ret = -FI_EAGAIN;
@@ -321,10 +333,11 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf,
 	smr_generic_format(cmd, peer_id, op, tag, data, op_flags);
 
 	if (len <= SMR_MSG_DATA_LEN) {
-		smr_format_inline(cmd, &msg_iov, 1);
+		smr_format_inline(cmd, FI_HMEM_SYSTEM, 0, &msg_iov, 1);
 	} else {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
-		smr_format_inject(cmd, &msg_iov, 1, peer_smr, tx_buf);
+		smr_format_inject(cmd, FI_HMEM_SYSTEM, 0, &msg_iov, 1,
+				  peer_smr, tx_buf);
 	}
 	ofi_ep_tx_cntr_inc_func(&ep->util_ep, op);
 	peer_smr->cmd_cnt--;
@@ -354,8 +367,8 @@ ssize_t smr_senddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 	iov.iov_base = (void *) buf;
 	iov.iov_len = len;
 
-	return smr_generic_sendmsg(ep, &iov, 1, dest_addr, 0, data, context,
-				   ofi_op_msg,
+	return smr_generic_sendmsg(ep, &iov, &desc, 1, dest_addr, 0, data,
+				   context, ofi_op_msg,
 				   FI_REMOTE_CQ_DATA | smr_ep_tx_flags(ep));
 }
 
@@ -390,7 +403,7 @@ ssize_t smr_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc,
 	iov.iov_base = buf;
 	iov.iov_len = len;
 
-	return smr_generic_recv(ep, &iov, 1, src_addr, context, tag, ignore,
+	return smr_generic_recv(ep, &iov, &desc, 1, src_addr, context, tag, ignore,
 				smr_ep_rx_flags(ep), &ep->trecv_queue,
 				&ep->unexp_tagged_queue);
 }
@@ -403,7 +416,7 @@ ssize_t smr_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_recv(ep, iov, count, src_addr, context, tag, ignore,
+	return smr_generic_recv(ep, iov, desc, count, src_addr, context, tag, ignore,
 				smr_ep_rx_flags(ep), &ep->trecv_queue,
 				&ep->unexp_tagged_queue);
 }
@@ -415,8 +428,8 @@ ssize_t smr_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_recv(ep, msg->msg_iov, msg->iov_count, msg->addr,
-				msg->context, msg->tag, msg->ignore,
+	return smr_generic_recv(ep, msg->msg_iov, msg->desc, msg->iov_count,
+				msg->addr, msg->context, msg->tag, msg->ignore,
 				flags | ep->util_ep.rx_msg_flags,
 				&ep->trecv_queue, &ep->unexp_tagged_queue);
 }
@@ -432,7 +445,7 @@ ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len,
 	msg_iov.iov_base = (void *) buf;
 	msg_iov.iov_len = len;
 
-	return smr_generic_sendmsg(ep, &msg_iov, 1, dest_addr, tag,
+	return smr_generic_sendmsg(ep, &msg_iov, &desc, 1, dest_addr, tag,
 				   0, context, ofi_op_tagged,
 				   smr_ep_tx_flags(ep));
 }
@@ -445,7 +458,7 @@ ssize_t smr_tsendv(struct fid_ep *ep_fid, const struct iovec *iov,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_sendmsg(ep, iov, count, dest_addr, tag,
+	return smr_generic_sendmsg(ep, iov, desc, count, dest_addr, tag,
 				   0, context, ofi_op_tagged,
 				   smr_ep_tx_flags(ep));
 }
@@ -457,7 +470,7 @@ ssize_t smr_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg,
 
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 
-	return smr_generic_sendmsg(ep, msg->msg_iov, msg->iov_count,
+	return smr_generic_sendmsg(ep, msg->msg_iov, msg->desc, msg->iov_count,
 				   msg->addr, msg->tag, msg->data, msg->context,
 				   ofi_op_tagged, flags | ep->util_ep.tx_msg_flags);
 }
@@ -481,8 +494,8 @@ ssize_t smr_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len,
 	iov.iov_base = (void *) buf;
 	iov.iov_len = len;
 
-	return smr_generic_sendmsg(ep, &iov, 1, dest_addr, tag, data, context,
-				   ofi_op_tagged,
+	return smr_generic_sendmsg(ep, &iov, &desc, 1, dest_addr, tag, data,
+				   context, ofi_op_tagged,
 				   FI_REMOTE_CQ_DATA | smr_ep_tx_flags(ep));
 }
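
The send and receive paths above derive the HMEM iface and device from the memory-registration descriptor via smr_get_mr_hmem_iface(), which is not defined in these hunks. Given its call sites, a plausible shape is the following sketch (struct ofi_mr's iface/device fields are confirmed by the smr_progress.c hunk below):

/* Sketch: fall back to system memory when FI_MR_HMEM is not in effect
 * or no descriptor was supplied; otherwise read iface/device from the
 * registered MR. */
static enum fi_hmem_iface
smr_get_mr_hmem_iface_sketch(struct util_domain *domain, void **desc,
			     uint64_t *device)
{
	struct ofi_mr *mr;

	if (!(domain->mr_mode & FI_MR_HMEM) || !desc || !*desc) {
		*device = 0;
		return FI_HMEM_SYSTEM;
	}

	mr = *desc;
	*device = mr->device;
	return mr->iface;
}
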
 
diff --git a/deps/libfabric/prov/shm/src/smr_progress.c b/deps/libfabric/prov/shm/src/smr_progress.c
index 35d056f848eed695fce2253802ebf183ed0469d5..ea0f7a6eb9e7f395c685a4252d30f5eff1b8f1c2 100644
--- a/deps/libfabric/prov/shm/src/smr_progress.c
+++ b/deps/libfabric/prov/shm/src/smr_progress.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2018 Intel Corporation. All rights reserved
+ * Copyright (c) 2013-2020 Intel Corporation. All rights reserved
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,27 +35,30 @@
 #include <sys/uio.h>
 
 #include "ofi_iov.h"
+#include "ofi_hmem.h"
 #include "smr.h"
 
 
 static inline void smr_try_progress_to_sar(struct smr_sar_msg *sar_msg,
 				struct smr_resp *resp,
-				struct smr_cmd *cmd, struct iovec *iov,
+				struct smr_cmd *cmd, enum fi_hmem_iface iface,
+				uint64_t device, struct iovec *iov,
 				size_t iov_count, size_t *bytes_done, int *next)
 {
 	while (*bytes_done < cmd->msg.hdr.size &&
-	       smr_copy_to_sar(sar_msg, resp, cmd, iov, iov_count, bytes_done,
-			       next));
+	       smr_copy_to_sar(sar_msg, resp, cmd, iface, device, iov,
+			       iov_count, bytes_done, next));
 }
 
 static inline void smr_try_progress_from_sar(struct smr_sar_msg *sar_msg,
 				struct smr_resp *resp,
-				struct smr_cmd *cmd, struct iovec *iov,
+				struct smr_cmd *cmd, enum fi_hmem_iface iface,
+				uint64_t device, struct iovec *iov,
 				size_t iov_count, size_t *bytes_done, int *next)
 {
 	while (*bytes_done < cmd->msg.hdr.size &&
-	       smr_copy_from_sar(sar_msg, resp, cmd, iov, iov_count, bytes_done,
-				 next));
+	       smr_copy_from_sar(sar_msg, resp, cmd, iface, device, iov,
+				 iov_count, bytes_done, next));
 }
 
 static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
@@ -67,7 +70,7 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 	struct smr_sar_msg *sar_msg = NULL;
 	uint8_t *src;
 
-	peer_smr = smr_peer_region(ep->region, pending->addr);
+	peer_smr = smr_peer_region(ep->region, pending->peer_id);
 
 	switch (pending->cmd.msg.hdr.op_src) {
 	case smr_src_iov:
@@ -79,16 +82,25 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 		    sar_msg->sar[1].status == SMR_SAR_FREE)
 			break;
 
+		if (peer_smr != ep->region) {
+			if (fastlock_tryacquire(&peer_smr->lock))
+				return -FI_EAGAIN;
+		}
 		if (pending->cmd.msg.hdr.op == ofi_op_read_req)
 			smr_try_progress_from_sar(sar_msg, resp,
-					&pending->cmd, pending->iov,
+					&pending->cmd, pending->iface,
+					pending->device, pending->iov,
 				        pending->iov_count, &pending->bytes_done,
 					&pending->next);
 		else
 			smr_try_progress_to_sar(sar_msg, resp,
-					&pending->cmd, pending->iov,
+					&pending->cmd, pending->iface,
+					pending->device, pending->iov,
 					pending->iov_count, &pending->bytes_done,
 					&pending->next);
+		if (peer_smr != ep->region)
+			fastlock_release(&peer_smr->lock);
+
 		if (pending->bytes_done != pending->cmd.msg.hdr.size ||
 		    sar_msg->sar[0].status != SMR_SAR_FREE ||
 		    sar_msg->sar[1].status != SMR_SAR_FREE)
@@ -151,7 +163,7 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp,
 	} else if (sar_msg) {
 		smr_freestack_push(smr_sar_pool(peer_smr), sar_msg);
 		peer_smr->sar_cnt++;
-		smr_peer_data(ep->region)[pending->addr].sar_status = 0;
+		smr_peer_data(ep->region)[pending->peer_id].sar_status = 0;
 	}
 
 	if (peer_smr != ep->region)
@@ -193,11 +205,12 @@ static void smr_progress_resp(struct smr_ep *ep)
 	fastlock_release(&ep->region->lock);
 }
 
-static int smr_progress_inline(struct smr_cmd *cmd, struct iovec *iov,
+static int smr_progress_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface,
+			       uint64_t device, struct iovec *iov,
 			       size_t iov_count, size_t *total_len)
 {
-	*total_len = ofi_copy_to_iov(iov, iov_count, 0, cmd->msg.data.msg,
-				     cmd->msg.hdr.size);
+	*total_len = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0,
+					  cmd->msg.data.msg, cmd->msg.hdr.size);
 	if (*total_len != cmd->msg.hdr.size) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 			"recv truncated");
@@ -206,7 +219,8 @@ static int smr_progress_inline(struct smr_cmd *cmd, struct iovec *iov,
 	return 0;
 }
 
-static int smr_progress_inject(struct smr_cmd *cmd, struct iovec *iov,
+static int smr_progress_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface,
+			       uint64_t device, struct iovec *iov,
 			       size_t iov_count, size_t *total_len,
 			       struct smr_ep *ep, int err)
 {
@@ -222,11 +236,11 @@ static int smr_progress_inject(struct smr_cmd *cmd, struct iovec *iov,
 	}
 
 	if (cmd->msg.hdr.op == ofi_op_read_req) {
-		*total_len = ofi_copy_from_iov(tx_buf->data, cmd->msg.hdr.size,
-					       iov, iov_count, 0);
+		*total_len = ofi_copy_from_hmem_iov(tx_buf->data, cmd->msg.hdr.size,
+						    iface, device, iov, iov_count, 0);
 	} else {
-		*total_len = ofi_copy_to_iov(iov, iov_count, 0, tx_buf->data,
-					     cmd->msg.hdr.size);
+		*total_len = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0,
+						  tx_buf->data, cmd->msg.hdr.size);
 		smr_freestack_push(smr_inject_pool(ep->region), tx_buf);
 	}
 
@@ -245,10 +259,9 @@ static int smr_progress_iov(struct smr_cmd *cmd, struct iovec *iov,
 {
 	struct smr_region *peer_smr;
 	struct smr_resp *resp;
-	int peer_id, ret;
+	int ret;
 
-	peer_id = (int) cmd->msg.hdr.addr;
-	peer_smr = smr_peer_region(ep->region, peer_id);
+	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 	resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data);
 
 	if (err) {
@@ -256,30 +269,11 @@ static int smr_progress_iov(struct smr_cmd *cmd, struct iovec *iov,
 		goto out;
 	}
 
-	if (cmd->msg.hdr.op == ofi_op_read_req) {
-		ret = ofi_process_vm_writev(peer_smr->pid, iov, iov_count,
-					    cmd->msg.data.iov,
-					    cmd->msg.data.iov_count, 0);
-	} else {
-		ret = ofi_process_vm_readv(peer_smr->pid, iov, iov_count,
-					   cmd->msg.data.iov,
-					   cmd->msg.data.iov_count, 0);
-	}
-
-	if (ret != cmd->msg.hdr.size) {
-		if (ret < 0) {
-			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-				"CMA write error\n");
-			ret = errno;
-		} else { 
-			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-				"partial read occurred\n");
-			ret = FI_EIO;
-		}
-	} else {
-		*total_len = ret;
-		ret = 0;
-	}
+	ret = smr_cma_loop(peer_smr->pid, iov, iov_count, cmd->msg.data.iov,
+			   cmd->msg.data.iov_count, 0, cmd->msg.hdr.size,
+			   cmd->msg.hdr.op == ofi_op_read_req);
+	if (!ret)
+		*total_len = cmd->msg.hdr.size;
 
 out:
 	//Status must be set last (signals peer: op done, valid resp entry)
@@ -292,15 +286,14 @@ static int smr_mmap_peer_copy(struct smr_ep *ep, struct smr_cmd *cmd,
 				 struct iovec *iov, size_t iov_count,
 				 size_t *total_len)
 {
-	char shm_name[NAME_MAX];
+	char shm_name[SMR_NAME_MAX];
 	void *mapped_ptr;
-	int peer_id, fd, num;
+	int fd, num;
 	int ret = 0;
 
-	peer_id = (int) cmd->msg.hdr.addr;
-
-	num = smr_mmap_name(shm_name, ep->region->map->peers[peer_id].peer.name,
-			    cmd->msg.hdr.msg_id);
+	num = smr_mmap_name(shm_name,
+			ep->region->map->peers[cmd->msg.hdr.id].peer.name,
+			cmd->msg.hdr.msg_id);
 	if (num < 0) {
 		FI_WARN(&smr_prov, FI_LOG_AV, "generating shm file name failed\n");
 		return -errno;
@@ -354,10 +347,9 @@ static int smr_progress_mmap(struct smr_cmd *cmd, struct iovec *iov,
 {
 	struct smr_region *peer_smr;
 	struct smr_resp *resp;
-	int peer_id, ret;
+	int ret;
 
-	peer_id = (int) cmd->msg.hdr.addr;
-	peer_smr = smr_peer_region(ep->region, peer_id);
+	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 	resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data);
 
 	ret = smr_mmap_peer_copy(ep, cmd, iov, iov_count, total_len);
@@ -369,8 +361,9 @@ static int smr_progress_mmap(struct smr_cmd *cmd, struct iovec *iov,
 }
 
 static struct smr_sar_entry *smr_progress_sar(struct smr_cmd *cmd,
-			struct smr_rx_entry *rx_entry, struct iovec *iov,
-			size_t iov_count, size_t *total_len, struct smr_ep *ep)
+			struct smr_rx_entry *rx_entry, enum fi_hmem_iface iface,
+			uint64_t device, struct iovec *iov, size_t iov_count,
+			size_t *total_len, struct smr_ep *ep)
 {
 	struct smr_region *peer_smr;
 	struct smr_sar_entry *sar_entry;
@@ -380,18 +373,18 @@ static struct smr_sar_entry *smr_progress_sar(struct smr_cmd *cmd,
 	int next = 0;
 
 	sar_msg = smr_get_ptr(ep->region, cmd->msg.data.sar);
-	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.addr);
+	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 	resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data);
 
 	memcpy(sar_iov, iov, sizeof(*iov) * iov_count);
 	(void) ofi_truncate_iov(sar_iov, &iov_count, cmd->msg.hdr.size);
 
 	if (cmd->msg.hdr.op == ofi_op_read_req)
-		smr_try_progress_to_sar(sar_msg, resp, cmd, sar_iov, iov_count,
-					total_len, &next);
+		smr_try_progress_to_sar(sar_msg, resp, cmd, iface, device,
+					sar_iov, iov_count, total_len, &next);
 	else
-		smr_try_progress_from_sar(sar_msg, resp, cmd, sar_iov, iov_count,
-					  total_len, &next);
+		smr_try_progress_from_sar(sar_msg, resp, cmd, iface, device,
+					  sar_iov, iov_count, total_len, &next);
 
 	if (*total_len == cmd->msg.hdr.size)
 		return NULL;
@@ -411,6 +404,9 @@ static struct smr_sar_entry *smr_progress_sar(struct smr_cmd *cmd,
 		sar_entry->rx_entry.flags = cmd->msg.hdr.op_flags;
 	}
 
+	sar_entry->iface = iface;
+	sar_entry->device = device;
+
 	dlist_insert_tail(&sar_entry->entry, &ep->sar_list);
 	*total_len = cmd->msg.hdr.size;
 	return sar_entry;
@@ -438,14 +434,17 @@ static void smr_do_atomic(void *src, void *dst, void *cmp, enum fi_datatype data
 {
 	char tmp_result[SMR_INJECT_SIZE];
 
-	if (op >= OFI_SWAP_OP_START) {
-		ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst,
-			src, cmp, tmp_result, cnt);
-	} else if (flags & SMR_RMA_REQ) {
-		ofi_atomic_readwrite_handlers[op][datatype](dst, src,
-			tmp_result, cnt);
-	} else if (op != FI_ATOMIC_READ) {
-		ofi_atomic_write_handlers[op][datatype](dst, src, cnt);
+	if (ofi_atomic_isswap_op(op)) {
+		ofi_atomic_swap_handler(op, datatype, dst, src, cmp,
+					tmp_result, cnt);
+	} else if (flags & SMR_RMA_REQ && ofi_atomic_isreadwrite_op(op)) {
+		ofi_atomic_readwrite_handler(op, datatype, dst, src,
+					     tmp_result, cnt);
+	} else if (ofi_atomic_iswrite_op(op)) {
+		ofi_atomic_write_handler(op, datatype, dst, src, cnt);
+	} else {
+		FI_WARN(&smr_prov, FI_LOG_EP_DATA,
+			"invalid atomic operation\n");
 	}
 
 	if (flags & SMR_RMA_REQ)
@@ -542,12 +541,14 @@ static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd,
 
 	switch (cmd->msg.hdr.op_src) {
 	case smr_src_inline:
-		entry->err = smr_progress_inline(cmd, entry->iov, entry->iov_count,
+		entry->err = smr_progress_inline(cmd, entry->iface, entry->device,
+						 entry->iov, entry->iov_count,
 						 &total_len);
 		ep->region->cmd_cnt++;
 		break;
 	case smr_src_inject:
-		entry->err = smr_progress_inject(cmd, entry->iov, entry->iov_count,
+		entry->err = smr_progress_inject(cmd, entry->iface, entry->device,
+						 entry->iov, entry->iov_count,
 						 &total_len, ep, 0);
 		ep->region->cmd_cnt++;
 		break;
@@ -560,8 +561,8 @@ static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd,
 					       &total_len, ep);
 		break;
 	case smr_src_sar:
-		sar = smr_progress_sar(cmd, entry, entry->iov, entry->iov_count,
-				       &total_len, ep);
+		sar = smr_progress_sar(cmd, entry, entry->iface, entry->device,
+				       entry->iov, entry->iov_count, &total_len, ep);
 		break;
 	default:
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
@@ -583,7 +584,7 @@ static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd,
 
 	if (!sar) {
 		ret = smr_complete_rx(ep, entry->context, cmd->msg.hdr.op,
-				comp_flags, total_len, comp_buf, cmd->msg.hdr.addr,
+				comp_flags, total_len, comp_buf, cmd->msg.hdr.id,
 				cmd->msg.hdr.tag, cmd->msg.hdr.data, entry->err);
 		if (ret) {
 			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
@@ -599,6 +600,34 @@ static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd,
 	return 0;
 }
 
+static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd)
+{
+	struct smr_region *peer_smr;
+	struct smr_inject_buf *tx_buf;
+	size_t inj_offset;
+	int64_t idx = -1;
+	int ret = 0;
+
+	inj_offset = (size_t) cmd->msg.hdr.src_data;
+	tx_buf = smr_get_ptr(ep->region, inj_offset);
+
+	ret = smr_map_add(&smr_prov, ep->region->map,
+			  (char *) tx_buf->data, &idx);
+	if (ret)
+		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
+			"Error processing mapping request\n");
+
+	peer_smr = smr_peer_region(ep->region, idx);
+
+	smr_peer_data(peer_smr)[cmd->msg.hdr.id].addr.id = idx;
+
+	smr_peer_data(ep->region)[idx].addr.id = cmd->msg.hdr.id;
+
+	smr_freestack_push(smr_inject_pool(ep->region), tx_buf);
+	ofi_cirque_discard(smr_cmd_queue(ep->region));
+	ep->region->cmd_cnt++;
+}
+
 static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd)
 {
 	struct smr_queue *recv_queue;
@@ -616,7 +645,7 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd)
 	recv_queue = (cmd->msg.hdr.op == ofi_op_tagged) ?
 		      &ep->trecv_queue : &ep->recv_queue;
 
-	match_attr.addr = cmd->msg.hdr.addr;
+	match_attr.id = cmd->msg.hdr.id;
 	match_attr.tag = cmd->msg.hdr.tag;
 
 	dlist_entry = dlist_find_first_match(&recv_queue->list,
@@ -652,6 +681,9 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 	size_t iov_count;
 	size_t total_len = 0;
 	int err = 0, ret = 0;
+	struct ofi_mr *mr;
+	enum fi_hmem_iface iface = FI_HMEM_SYSTEM;
+	uint64_t device = 0;
 
 	domain = container_of(ep->util_ep.domain, struct smr_domain,
 			      util_domain);
@@ -667,18 +699,28 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 	ep->region->cmd_cnt++;
 	rma_cmd = ofi_cirque_head(smr_cmd_queue(ep->region));
 
+	fastlock_acquire(&domain->util_domain.lock);
 	for (iov_count = 0; iov_count < rma_cmd->rma.rma_count; iov_count++) {
-		ret = ofi_mr_verify(&domain->util_domain.mr_map,
-				rma_cmd->rma.rma_iov[iov_count].len,
+		ret = ofi_mr_map_verify(&domain->util_domain.mr_map,
 				(uintptr_t *) &(rma_cmd->rma.rma_iov[iov_count].addr),
+				rma_cmd->rma.rma_iov[iov_count].len,
 				rma_cmd->rma.rma_iov[iov_count].key,
-				ofi_rx_mr_reg_flags(cmd->msg.hdr.op, 0));
+				ofi_rx_mr_reg_flags(cmd->msg.hdr.op, 0), (void **) &mr);
 		if (ret)
 			break;
 
 		iov[iov_count].iov_base = (void *) rma_cmd->rma.rma_iov[iov_count].addr;
 		iov[iov_count].iov_len = rma_cmd->rma.rma_iov[iov_count].len;
+
+		if (!iov_count) {
+			iface = mr->iface;
+			device = mr->device;
+		} else {
+			assert(mr->iface == iface && mr->device == device);
+		}
 	}
+	fastlock_release(&domain->util_domain.lock);
+
 	ofi_cirque_discard(smr_cmd_queue(ep->region));
 	if (ret) {
 		ep->region->cmd_cnt++;
@@ -687,13 +729,15 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 
 	switch (cmd->msg.hdr.op_src) {
 	case smr_src_inline:
-		err = smr_progress_inline(cmd, iov, iov_count, &total_len);
+		err = smr_progress_inline(cmd, iface, device, iov, iov_count,
+					  &total_len);
 		ep->region->cmd_cnt++;
 		break;
 	case smr_src_inject:
-		err = smr_progress_inject(cmd, iov, iov_count, &total_len, ep, ret);
+		err = smr_progress_inject(cmd, iface, device, iov, iov_count,
+					  &total_len, ep, ret);
 		if (cmd->msg.hdr.op == ofi_op_read_req && cmd->msg.hdr.data) {
-			peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.addr);
+			peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 			resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data);
 			resp->status = -err;
 		} else {
@@ -707,7 +751,8 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 		err = smr_progress_mmap(cmd, iov, iov_count, &total_len, ep);
 		break;
 	case smr_src_sar:
-		if (smr_progress_sar(cmd, NULL, iov, iov_count, &total_len, ep))
+		if (smr_progress_sar(cmd, NULL, iface, device, iov, iov_count,
+				     &total_len, ep))
 			return ret;
 		break;
 	default:
@@ -719,7 +764,7 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd)
 	ret = smr_complete_rx(ep, (void *) cmd->msg.hdr.msg_id,
 			      cmd->msg.hdr.op, cmd->msg.hdr.op_flags,
 			      total_len, iov_count ? iov[0].iov_base : NULL,
-			      cmd->msg.hdr.addr, 0, cmd->msg.hdr.data, err);
+			      cmd->msg.hdr.id, 0, cmd->msg.hdr.data, err);
 	if (ret) {
 		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 		"unable to process rx completion\n");
@@ -779,7 +824,7 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd)
 		err = -FI_EINVAL;
 	}
 	if (cmd->msg.hdr.data) {
-		peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.addr);
+		peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 		resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data);
 		resp->status = -err;
 	} else {
@@ -792,11 +837,11 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd)
 
 	ret = smr_complete_rx(ep, NULL, cmd->msg.hdr.op, cmd->msg.hdr.op_flags,
 			      total_len, ioc_count ? ioc[0].addr : NULL,
-			      cmd->msg.hdr.addr, 0, cmd->msg.hdr.data, err);
+			      cmd->msg.hdr.id, 0, cmd->msg.hdr.data, err);
 	if (ret)
 		return ret;
 
-	return err; 
+	return err;
 }
 
 static void smr_progress_cmd(struct smr_ep *ep)
@@ -821,7 +866,8 @@ static void smr_progress_cmd(struct smr_ep *ep)
 			break;
 		case ofi_op_write_async:
 		case ofi_op_read_async:
-			ofi_ep_rx_cntr_inc_func(&ep->util_ep, cmd->msg.hdr.op);
+			ofi_ep_rx_cntr_inc_func(&ep->util_ep,
+						cmd->msg.hdr.op);
 			ofi_cirque_discard(smr_cmd_queue(ep->region));
 			ep->region->cmd_cnt++;
 			break;
@@ -830,12 +876,14 @@ static void smr_progress_cmd(struct smr_ep *ep)
 		case ofi_op_atomic_compare:
 			ret = smr_progress_cmd_atomic(ep, cmd);
 			break;
+		case SMR_OP_MAX + ofi_ctrl_connreq:
+			smr_progress_connreq(ep, cmd);
+			break;
 		default:
 			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
 				"unidentified operation type\n");
 			ret = -FI_EINVAL;
 		}
-
 		if (ret) {
 			if (ret != -FI_EAGAIN) {
 				FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
@@ -856,21 +904,23 @@ static void smr_progress_sar_list(struct smr_ep *ep)
 	struct smr_resp *resp;
 	struct dlist_entry *tmp;
 	int ret;
- 
+
 	fastlock_acquire(&ep->region->lock);
 	fastlock_acquire(&ep->util_ep.rx_cq->cq_lock);
 
 	dlist_foreach_container_safe(&ep->sar_list, struct smr_sar_entry,
 				     sar_entry, entry, tmp) {
 		sar_msg = smr_get_ptr(ep->region, sar_entry->cmd.msg.data.sar);
-		peer_smr = smr_peer_region(ep->region, sar_entry->cmd.msg.hdr.addr);
+		peer_smr = smr_peer_region(ep->region, sar_entry->cmd.msg.hdr.id);
 		resp = smr_get_ptr(peer_smr, sar_entry->cmd.msg.hdr.src_data);
 		if (sar_entry->cmd.msg.hdr.op == ofi_op_read_req)
 			smr_try_progress_to_sar(sar_msg, resp, &sar_entry->cmd,
+					sar_entry->iface, sar_entry->device,
 					sar_entry->iov, sar_entry->iov_count,
 					&sar_entry->bytes_done, &sar_entry->next);
 		else
 			smr_try_progress_from_sar(sar_msg, resp, &sar_entry->cmd,
+					sar_entry->iface, sar_entry->device,
 					sar_entry->iov, sar_entry->iov_count,
 					&sar_entry->bytes_done, &sar_entry->next);
 
@@ -880,7 +930,7 @@ static void smr_progress_sar_list(struct smr_ep *ep)
 					sar_entry->rx_entry.flags,
 					sar_entry->bytes_done,
 					sar_entry->rx_entry.iov[0].iov_base,
-					sar_entry->cmd.msg.hdr.addr,
+					sar_entry->cmd.msg.hdr.id,
 					sar_entry->cmd.msg.hdr.tag,
 					sar_entry->cmd.msg.hdr.data, 0);
 			if (ret) {
@@ -904,8 +954,7 @@ void smr_ep_progress(struct util_ep *util_ep)
 	smr_progress_resp(ep);
 	smr_progress_cmd(ep);
 
-	if (ep->region->cma_cap == SMR_CMA_CAP_OFF)
-		smr_progress_sar_list(ep);
+	smr_progress_sar_list(ep);
 }
 
 int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry,
@@ -917,7 +966,7 @@ int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry,
 	int multi_recv;
 	int ret;
 
-	match_attr.addr = entry->addr;
+	match_attr.id = entry->peer_id;
 	match_attr.ignore = entry->ignore;
 	match_attr.tag = entry->tag;
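
smr_progress_iov() above replaces the open-coded ofi_process_vm_readv/writev error handling with smr_cma_loop(), whose definition is outside these hunks. A hedged sketch consistent with its call sites (returns 0 once the full size has moved, negative errno on failure), written against the raw process_vm_* syscalls:

#define _GNU_SOURCE
#include <errno.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Sketch only: loop until 'total' bytes have transferred, handling short
 * transfers by consuming the completed prefix of both iovec arrays in
 * place (ofi_consume_iov is assumed; see the sketch after the smr_rma.c
 * hunks below). */
static int smr_cma_loop_sketch(pid_t pid, struct iovec *local, size_t local_cnt,
			       struct iovec *remote, size_t remote_cnt,
			       unsigned long flags, size_t total, bool write)
{
	ssize_t ret;

	while (total) {
		ret = write ?
		      process_vm_writev(pid, local, local_cnt,
					remote, remote_cnt, flags) :
		      process_vm_readv(pid, local, local_cnt,
				       remote, remote_cnt, flags);
		if (ret < 0)
			return -errno;

		total -= (size_t) ret;
		if (total) {
			ofi_consume_iov(local, &local_cnt, (size_t) ret);
			ofi_consume_iov(remote, &remote_cnt, (size_t) ret);
		}
	}
	return 0;
}
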
 
diff --git a/deps/libfabric/prov/shm/src/smr_rma.c b/deps/libfabric/prov/shm/src/smr_rma.c
index 95ceb6d72f3d4ec230bbc942bdc3ef4f42d3dab9..b41ad8887c21aef5dfaf0af15b1c2808c287c09a 100644
--- a/deps/libfabric/prov/shm/src/smr_rma.c
+++ b/deps/libfabric/prov/shm/src/smr_rma.c
@@ -59,10 +59,11 @@ ssize_t smr_rma_fast(struct smr_region *peer_smr, struct smr_cmd *cmd,
 		     void **desc, int peer_id, void *context, uint32_t op,
 		     uint64_t op_flags)
 {
-	struct iovec rma_iovec[SMR_IOV_LIMIT];
+	struct iovec cma_iovec[SMR_IOV_LIMIT], rma_iovec[SMR_IOV_LIMIT];
 	size_t total_len;
 	int ret, i;
 
+	memcpy(cma_iovec, iov, sizeof(*iov) * iov_count);
 	for (i = 0; i < rma_count; i++) {
 		rma_iovec[i].iov_base = (void *) rma_iov[i].addr;
 		rma_iovec[i].iov_len = rma_iov[i].len;
@@ -70,26 +71,11 @@ ssize_t smr_rma_fast(struct smr_region *peer_smr, struct smr_cmd *cmd,
 
 	total_len = ofi_total_iov_len(iov, iov_count);
 
-	if (op == ofi_op_write) {
-		ret = ofi_process_vm_writev(peer_smr->pid, iov, iov_count,
-					    rma_iovec, rma_count, 0);
-	} else {
-		ret = ofi_process_vm_readv(peer_smr->pid, iov, iov_count,
-					   rma_iovec, rma_count, 0);
-	}
+	ret = smr_cma_loop(peer_smr->pid, cma_iovec, iov_count, rma_iovec,
+			   rma_count, 0, total_len, op == ofi_op_write);
 
-	if (ret != total_len) {
-		if (ret < 0) {
-			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-				"CMA write error\n");
-			ret = -errno;
-		} else {
-			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
-				"unable to process tx completion\n");
-			ret = -FI_EIO;
-		}
+	if (ret)
 		return ret;
-	}
 
 	smr_format_rma_resp(cmd, peer_id, rma_iov, rma_count, total_len,
 			    (op == ofi_op_write) ? ofi_op_write_async :
@@ -110,7 +96,10 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 	struct smr_resp *resp;
 	struct smr_cmd *cmd;
 	struct smr_tx_entry *pend;
-	int id, peer_id, cmds, err = 0, comp = 1;
+	enum fi_hmem_iface iface;
+	uint64_t device;
+	int64_t id, peer_id;
+	int cmds, err = 0, comp = 1;
 	uint16_t comp_flags;
 	ssize_t ret = 0;
 	size_t total_len;
@@ -120,19 +109,17 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 
 	domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain);
 
-	id = (int) addr;
-	peer_id = smr_peer_data(ep->region)[id].addr.addr;
+	id = smr_verify_peer(ep, addr);
+	if (id < 0)
+		return -FI_EAGAIN;
 
-	ret = smr_verify_peer(ep, id);
-	if (ret)
-		return ret;
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
+	peer_smr = smr_peer_region(ep->region, id);
 
 	cmds = 1 + !(domain->fast_rma && !(op_flags &
 		    (FI_REMOTE_CQ_DATA | FI_DELIVERY_COMPLETE)) &&
-		     rma_count == 1 &&
-		     ep->region->cma_cap == SMR_CMA_CAP_ON);
+		     rma_count == 1 && smr_cma_enabled(ep, peer_smr));
 
-	peer_smr = smr_peer_region(ep->region, id);
 	fastlock_acquire(&peer_smr->lock);
 	if (peer_smr->cmd_cnt < cmds ||
 	    smr_peer_data(ep->region)[id].sar_status) {
@@ -156,16 +143,18 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 		goto commit_comp;
 	}
 
+	iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, &device);
+
 	total_len = ofi_total_iov_len(iov, iov_count);
 
 	smr_generic_format(cmd, peer_id, op, 0, data, op_flags);
 	if (total_len <= SMR_MSG_DATA_LEN && op == ofi_op_write &&
 	    !(op_flags & FI_DELIVERY_COMPLETE)) {
-		smr_format_inline(cmd, iov, iov_count);
+		smr_format_inline(cmd, iface, device, iov, iov_count);
 	} else if (total_len <= SMR_INJECT_SIZE &&
 		   !(op_flags & FI_DELIVERY_COMPLETE)) {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
-		smr_format_inject(cmd, iov, iov_count, peer_smr, tx_buf);
+		smr_format_inject(cmd, iface, device, iov, iov_count, peer_smr, tx_buf);
 		if (op == ofi_op_read_req) {
 			if (ofi_cirque_isfull(smr_resp_queue(ep->region))) {
 				smr_freestack_push(smr_inject_pool(peer_smr), tx_buf);
@@ -175,7 +164,7 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 			cmd->msg.hdr.op_flags |= SMR_RMA_REQ;
 			resp = ofi_cirque_tail(smr_resp_queue(ep->region));
 			pend = freestack_pop(ep->pend_fs);
-			smr_format_pend_resp(pend, cmd, context, iov,
+			smr_format_pend_resp(pend, cmd, context, iface, device, iov,
 					     iov_count, id, resp);
 			cmd->msg.hdr.data = smr_get_offset(ep->region, resp);
 			ofi_cirque_commit(smr_resp_queue(ep->region));
@@ -188,15 +177,18 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 		}
 		resp = ofi_cirque_tail(smr_resp_queue(ep->region));
 		pend = freestack_pop(ep->pend_fs);
-		if (ep->region->cma_cap == SMR_CMA_CAP_ON) {
-			smr_format_iov(cmd, iov, iov_count, total_len, ep->region, resp);
+		if (smr_cma_enabled(ep, peer_smr) && iface == FI_HMEM_SYSTEM) {
+			smr_format_iov(cmd, iov, iov_count, total_len, ep->region,
+				       resp);
 		} else {
-			if (total_len <= smr_env.sar_threshold) {
+			if (total_len <= smr_env.sar_threshold ||
+			    iface != FI_HMEM_SYSTEM) {
 				if (!peer_smr->sar_cnt) {
 					ret = -FI_EAGAIN;
 				} else {
 					sar = smr_freestack_pop(smr_sar_pool(peer_smr));
-					smr_format_sar(cmd, iov, iov_count, total_len,
+					smr_format_sar(cmd, iface, device, iov,
+						       iov_count, total_len,
 						       ep->region, peer_smr, sar,
 						       pend, resp);
 					peer_smr->sar_cnt--;
@@ -212,7 +204,8 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov,
 				goto unlock_cq;
 			}
 		}
-		smr_format_pend_resp(pend, cmd, context, iov, iov_count, id, resp);
+		smr_format_pend_resp(pend, cmd, context, iface, device, iov,
+				     iov_count, id, resp);
 		ofi_cirque_commit(smr_resp_queue(ep->region));
 		comp = 0;
 	}
@@ -359,24 +352,24 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf,
 	struct smr_cmd *cmd;
 	struct iovec iov;
 	struct fi_rma_iov rma_iov;
-	int id, peer_id, cmds;
+	int64_t id, peer_id;
+	int cmds;
 	ssize_t ret = 0;
 
 	assert(len <= SMR_INJECT_SIZE);
 	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
 	domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain);
 
-	id = (int) dest_addr;
-	peer_id = smr_peer_data(ep->region)[id].addr.addr;
+	id = smr_verify_peer(ep, dest_addr);
+	if (id < 0)
+		return -FI_EAGAIN;
 
-	ret = smr_verify_peer(ep, id);
-	if (ret)
-		return ret;
+	peer_id = smr_peer_data(ep->region)[id].addr.id;
+	peer_smr = smr_peer_region(ep->region, id);
 
 	cmds = 1 + !(domain->fast_rma && !(flags & FI_REMOTE_CQ_DATA) &&
-		     ep->region->cma_cap == SMR_CMA_CAP_ON);
+		     smr_cma_enabled(ep, peer_smr));
 
-	peer_smr = smr_peer_region(ep->region, id);
 	fastlock_acquire(&peer_smr->lock);
 	if (peer_smr->cmd_cnt < cmds ||
 	    smr_peer_data(ep->region)[id].sar_status) {
@@ -402,10 +395,11 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf,
 
 	smr_generic_format(cmd, peer_id, ofi_op_write, 0, data, flags);
 	if (len <= SMR_MSG_DATA_LEN) {
-		smr_format_inline(cmd, &iov, 1);
+		smr_format_inline(cmd, FI_HMEM_SYSTEM, 0, &iov, 1);
 	} else {
 		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
-		smr_format_inject(cmd, &iov, 1, peer_smr, tx_buf);
+		smr_format_inject(cmd, FI_HMEM_SYSTEM, 0, &iov, 1,
+				  peer_smr, tx_buf);
 	}
 
 	ofi_cirque_commit(smr_cmd_queue(peer_smr));
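
The hunk above picks a wire protocol per transfer: tiny payloads are inlined into the command, mid-size payloads are copied through an inject bounce buffer, and large transfers use CMA (single-copy) only for system memory, falling back to SAR segmentation for device memory or when CMA is unavailable. A minimal sketch of that selection order, with illustrative thresholds and names rather than the provider's actual API:

#include <stdbool.h>
#include <stddef.h>

enum proto { PROTO_INLINE, PROTO_INJECT, PROTO_IOV, PROTO_SAR, PROTO_MMAP };

#define MSG_DATA_LEN	64	/* illustrative: fits inside the command */
#define INJECT_SIZE	4096	/* illustrative: fits a shared bounce buffer */

static enum proto select_proto(size_t total_len, bool delivery_complete,
			       bool cma_ok, bool system_mem, size_t sar_thresh)
{
	if (total_len <= MSG_DATA_LEN && !delivery_complete)
		return PROTO_INLINE;	/* payload rides in the command */
	if (total_len <= INJECT_SIZE && !delivery_complete)
		return PROTO_INJECT;	/* one copy through a bounce buffer */
	if (cma_ok && system_mem)
		return PROTO_IOV;	/* single-copy cross-memory attach */
	if (total_len <= sar_thresh || !system_mem)
		return PROTO_SAR;	/* segment and reassemble */
	return PROTO_MMAP;		/* assumed path for large system buffers */
}

Note how device (HMEM) memory always avoids the CMA path, matching the iface != FI_HMEM_SYSTEM checks above.
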
diff --git a/deps/libfabric/prov/shm/src/smr_signal.h b/deps/libfabric/prov/shm/src/smr_signal.h
index 67f6d285f4e4bd453655f9d2b6467b0aa5134cb1..35c46aee098e6ac8a4489ab2d02833e4000c0fae 100644
--- a/deps/libfabric/prov/shm/src/smr_signal.h
+++ b/deps/libfabric/prov/shm/src/smr_signal.h
@@ -53,8 +53,15 @@ static void smr_handle_signal(int signum, siginfo_t *info, void *ucontext)
 	if (ret)
 		return;
 
-	/* Raise signum to execute the original handler */
-	raise(signum);
+	/* chain to the handler that was installed before ours */
+	if (old_action[signum].sa_flags & SA_SIGINFO)
+		old_action[signum].sa_sigaction(signum, info, ucontext);
+	else if (old_action[signum].sa_handler == SIG_DFL ||
+		 old_action[signum].sa_handler == SIG_IGN)
+		return;
+	else
+		old_action[signum].sa_handler(signum);
+
 }
 
 static void smr_reg_sig_hander(int signum)
@@ -64,7 +71,7 @@ static void smr_reg_sig_hander(int signum)
 
 	memset(&action, 0, sizeof(action));
 	action.sa_sigaction = smr_handle_signal;
-	action.sa_flags |= SA_SIGINFO;
+	action.sa_flags |= SA_SIGINFO | SA_ONSTACK;
 
 	ret = sigaction(signum, &action, &old_action[signum]);
 	if (ret)
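
Calling the saved sigaction directly, instead of raise(signum), preserves the original siginfo_t and ucontext for handlers registered with SA_SIGINFO. A self-contained sketch of the chaining pattern this hunk implements (hypothetical names, not the provider's code):

#include <signal.h>
#include <string.h>

#ifndef NSIG
#define NSIG 65
#endif

static struct sigaction saved[NSIG];	/* previous disposition per signal */

static void chaining_handler(int signum, siginfo_t *info, void *ucontext)
{
	/* ... our own cleanup would run here ... */

	if (saved[signum].sa_flags & SA_SIGINFO)
		saved[signum].sa_sigaction(signum, info, ucontext);
	else if (saved[signum].sa_handler != SIG_DFL &&
		 saved[signum].sa_handler != SIG_IGN)
		saved[signum].sa_handler(signum);
	/* SIG_DFL/SIG_IGN: nothing to chain to */
}

static int install_chaining_handler(int signum)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_sigaction = chaining_handler;
	/* SA_ONSTACK keeps this working for apps that use sigaltstack() */
	act.sa_flags = SA_SIGINFO | SA_ONSTACK;
	return sigaction(signum, &act, &saved[signum]);
}
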
diff --git a/deps/libfabric/prov/sockets/include/sock.h b/deps/libfabric/prov/sockets/include/sock.h
index ab58605b299f20601e9fb71ddb3c7db877d47960..06273c9572e8bc95c0b93ca37f116b87080718e7 100644
--- a/deps/libfabric/prov/sockets/include/sock.h
+++ b/deps/libfabric/prov/sockets/include/sock.h
@@ -205,20 +205,22 @@ struct sock_conn_map {
 };
 
 struct sock_conn_listener {
-	ofi_epoll_t emap;
+	ofi_epoll_t epollfd;
 	struct fd_signal signal;
 	fastlock_t signal_lock; /* acquire before map lock */
 	pthread_t listener_thread;
 	int do_listen;
+	bool removed_from_epollfd;
 };
 
 struct sock_ep_cm_head {
-	ofi_epoll_t emap;
+	ofi_epoll_t epollfd;
 	struct fd_signal signal;
-	fastlock_t signal_lock;
+	pthread_mutex_t signal_lock;
 	pthread_t listener_thread;
 	struct dlist_entry msg_list;
 	int do_listen;
+	bool removed_from_epollfd;
 };
 
 struct sock_domain {
@@ -654,6 +656,8 @@ struct sock_rx_ctx {
 	struct dlist_entry ep_list;
 	fastlock_t lock;
 
+	struct dlist_entry *progress_start;
+
 	struct fi_rx_attr attr;
 	struct sock_rx_entry *rx_entry_pool;
 	struct slist pool_list;
@@ -886,8 +890,8 @@ struct sock_cq {
 	struct ofi_ringbuffd cq_rbfd;
 	struct ofi_ringbuf cqerr_rb;
 	struct dlist_entry overflow_list;
-	fastlock_t lock;
-	fastlock_t list_lock;
+	pthread_mutex_t lock;
+	pthread_mutex_t list_lock;
 
 	struct fid_wait *waitset;
 	int signal;
@@ -1216,7 +1220,6 @@ static inline size_t sock_rx_avail_len(struct sock_rx_entry *rx_entry)
 
 int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head);
 void sock_ep_cm_signal(struct sock_ep_cm_head *cm_head);
-void sock_ep_cm_signal_locked(struct sock_ep_cm_head *cm_head);
 void sock_ep_cm_stop_thread(struct sock_ep_cm_head *cm_head);
 void sock_ep_cm_wait_handle_finalized(struct sock_ep_cm_head *cm_head,
                                       struct sock_conn_req_handle *handle);
diff --git a/deps/libfabric/prov/sockets/src/sock_av.c b/deps/libfabric/prov/sockets/src/sock_av.c
index e33e3b883cc6765032237fad2e26589ea88f589f..05d3972a1513f5682b22cc8801e625c5c64fa599 100644
--- a/deps/libfabric/prov/sockets/src/sock_av.c
+++ b/deps/libfabric/prov/sockets/src/sock_av.c
@@ -441,10 +441,12 @@ static int sock_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count,
 	}
 	fastlock_release(&_av->list_lock);
 
+	fastlock_acquire(&_av->table_lock);
 	for (i = 0; i < count; i++) {
 		av_addr = &_av->table[fi_addr[i]];
 		av_addr->valid = 0;
 	}
+	fastlock_release(&_av->table_lock);
 
 	return 0;
 }
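
The added table_lock pairing means an entry's valid flag is never flipped while a concurrent lookup is reading it. A tiny sketch of the writer side this relies on (hypothetical types; the provider uses fastlock_t):

#include <pthread.h>
#include <stddef.h>

struct av_entry { int valid; };

/* writer: invalidate a batch of entries under the table lock */
static void av_invalidate(pthread_mutex_t *table_lock, struct av_entry *table,
			  const size_t *idx, size_t count)
{
	size_t i;

	pthread_mutex_lock(table_lock);
	for (i = 0; i < count; i++)
		table[idx[i]].valid = 0;
	pthread_mutex_unlock(table_lock);
}

/* reader: lookups take the same lock before testing entry->valid,
 * so they never observe a half-completed removal. */
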
diff --git a/deps/libfabric/prov/sockets/src/sock_conn.c b/deps/libfabric/prov/sockets/src/sock_conn.c
index 8c739db74f29ce311246c37d26c5e482f69a8f05..0d39956a82557d329497c531e633256b434cc11c 100644
--- a/deps/libfabric/prov/sockets/src/sock_conn.c
+++ b/deps/libfabric/prov/sockets/src/sock_conn.c
@@ -167,8 +167,9 @@ void sock_conn_release_entry(struct sock_conn_map *map, struct sock_conn *conn)
 	ofi_close_socket(conn->sock_fd);
 
 	conn->address_published = 0;
-        conn->connected = 0;
-        conn->sock_fd = -1;
+	conn->av_index = FI_ADDR_NOTAVAIL;
+	conn->connected = 0;
+	conn->sock_fd = -1;
 }
 
 static int sock_conn_get_next_index(struct sock_conn_map *map)
@@ -306,7 +307,7 @@ int sock_conn_stop_listener_thread(struct sock_conn_listener *conn_listener)
 	}
 
 	fd_signal_free(&conn_listener->signal);
-	ofi_epoll_close(conn_listener->emap);
+	ofi_epoll_close(conn_listener->epollfd);
 	fastlock_destroy(&conn_listener->signal_lock);
 
 	return 0;
@@ -323,7 +324,7 @@ static void *sock_conn_listener_thread(void *arg)
 	socklen_t addr_size;
 
 	while (conn_listener->do_listen) {
-		num_fds = ofi_epoll_wait(conn_listener->emap, ep_contexts,
+		num_fds = ofi_epoll_wait(conn_listener->epollfd, ep_contexts,
 		                        SOCK_EPOLL_WAIT_EVENTS, -1);
 		if (num_fds < 0) {
 			SOCK_LOG_ERROR("poll failed : %s\n", strerror(errno));
@@ -331,6 +332,15 @@ static void *sock_conn_listener_thread(void *arg)
 		}
 
 		fastlock_acquire(&conn_listener->signal_lock);
+		if (conn_listener->removed_from_epollfd) {
+			/* The epoll set changed while ofi_epoll_wait was
+			 * blocked.  Discard this batch and fetch an updated
+			 * set of events to avoid a possible use-after-free.
+			 */
+			conn_listener->removed_from_epollfd = false;
+			goto skip;
+		}
+
 		for (i = 0; i < num_fds; i++) {
 			conn_handle = ep_contexts[i];
 
@@ -359,6 +369,7 @@ static void *sock_conn_listener_thread(void *arg)
 			fastlock_release(&ep_attr->cmap.lock);
 			sock_pe_signal(ep_attr->domain->pe);
 		}
+skip:
 		fastlock_release(&conn_listener->signal_lock);
 	}
 
@@ -371,7 +382,7 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener)
 
 	fastlock_init(&conn_listener->signal_lock);
 
-	ret = ofi_epoll_create(&conn_listener->emap);
+	ret = ofi_epoll_create(&conn_listener->epollfd);
 	if (ret < 0) {
 		SOCK_LOG_ERROR("failed to create epoll set\n");
 		goto err1;
@@ -383,7 +394,7 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener)
 		goto err2;
 	}
 
-	ret = ofi_epoll_add(conn_listener->emap,
+	ret = ofi_epoll_add(conn_listener->epollfd,
 	                   conn_listener->signal.fd[FI_READ_FD],
 	                   OFI_EPOLL_IN, NULL);
 	if (ret != 0){
@@ -392,6 +403,7 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener)
 	}
 
 	conn_listener->do_listen = 1;
+	conn_listener->removed_from_epollfd = false;
 	ret = pthread_create(&conn_listener->listener_thread, NULL,
 	                     sock_conn_listener_thread, conn_listener);
 	if (ret < 0) {
@@ -404,7 +416,7 @@ err3:
 	conn_listener->do_listen = 0;
 	fd_signal_free(&conn_listener->signal);
 err2:
-	ofi_epoll_close(conn_listener->emap);
+	ofi_epoll_close(conn_listener->epollfd);
 err1:
 	fastlock_destroy(&conn_listener->signal_lock);
 	return ret;
@@ -463,7 +475,7 @@ int sock_conn_listen(struct sock_ep_attr *ep_attr)
 	conn_handle->do_listen = 1;
 
 	fastlock_acquire(&ep_attr->domain->conn_listener.signal_lock);
-	ret = ofi_epoll_add(ep_attr->domain->conn_listener.emap,
+	ret = ofi_epoll_add(ep_attr->domain->conn_listener.epollfd,
 	                   conn_handle->sock, OFI_EPOLL_IN, conn_handle);
 	fd_signal_set(&ep_attr->domain->conn_listener.signal);
 	fastlock_release(&ep_attr->domain->conn_listener.signal_lock);
@@ -581,6 +593,8 @@ retry:
 
 	SOCK_LOG_ERROR("Connect error, retrying - %s - %d\n",
 		       strerror(ofi_sockerr()), conn_fd);
+	ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
+			"Retrying connect to peer ", &addr.sa);
         goto do_connect;
 
 out:
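
The removed_from_epollfd flag closes a race: another thread can delete a socket from the epoll set (and free its context) between ofi_epoll_wait() returning and the listener walking the returned contexts. A minimal sketch of the pattern with plain epoll (hypothetical names):

#include <pthread.h>
#include <stdbool.h>
#include <sys/epoll.h>

struct listener {
	int epfd;
	pthread_mutex_t lock;
	bool set_changed;	/* set by whoever removes an fd */
	volatile int run;
};

static void *listener_loop(void *arg)
{
	struct listener *l = arg;
	struct epoll_event events[64];
	int i, n;

	while (l->run) {
		n = epoll_wait(l->epfd, events, 64, -1);
		if (n < 0)
			continue;

		pthread_mutex_lock(&l->lock);
		if (l->set_changed) {
			/* events[] may point at freed contexts; drop
			 * this batch and poll again for a fresh set. */
			l->set_changed = false;
			pthread_mutex_unlock(&l->lock);
			continue;
		}
		for (i = 0; i < n; i++)
			(void) events[i].data.ptr;	/* safe to use */
		pthread_mutex_unlock(&l->lock);
	}
	return NULL;
}

/* Removal side, in another thread:
 *	pthread_mutex_lock(&l->lock);
 *	epoll_ctl(l->epfd, EPOLL_CTL_DEL, fd, NULL);
 *	l->set_changed = true;
 *	pthread_mutex_unlock(&l->lock);
 */
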
diff --git a/deps/libfabric/prov/sockets/src/sock_cq.c b/deps/libfabric/prov/sockets/src/sock_cq.c
index 5a3b137cf0299a0dac72bcd5bd07346b0e4c73c6..6f893fa39a9e11a67ce6c87abdda979e7ab85f93 100644
--- a/deps/libfabric/prov/sockets/src/sock_cq.c
+++ b/deps/libfabric/prov/sockets/src/sock_cq.c
@@ -55,7 +55,7 @@ void sock_cq_add_tx_ctx(struct sock_cq *cq, struct sock_tx_ctx *tx_ctx)
 {
 	struct dlist_entry *entry;
 	struct sock_tx_ctx *curr_ctx;
-	fastlock_acquire(&cq->list_lock);
+	pthread_mutex_lock(&cq->list_lock);
 	for (entry = cq->tx_list.next; entry != &cq->tx_list;
 	     entry = entry->next) {
 		curr_ctx = container_of(entry, struct sock_tx_ctx, cq_entry);
@@ -65,22 +65,22 @@ void sock_cq_add_tx_ctx(struct sock_cq *cq, struct sock_tx_ctx *tx_ctx)
 	dlist_insert_tail(&tx_ctx->cq_entry, &cq->tx_list);
 	ofi_atomic_inc32(&cq->ref);
 out:
-	fastlock_release(&cq->list_lock);
+	pthread_mutex_unlock(&cq->list_lock);
 }
 
 void sock_cq_remove_tx_ctx(struct sock_cq *cq, struct sock_tx_ctx *tx_ctx)
 {
-	fastlock_acquire(&cq->list_lock);
+	pthread_mutex_lock(&cq->list_lock);
 	dlist_remove(&tx_ctx->cq_entry);
 	ofi_atomic_dec32(&cq->ref);
-	fastlock_release(&cq->list_lock);
+	pthread_mutex_unlock(&cq->list_lock);
 }
 
 void sock_cq_add_rx_ctx(struct sock_cq *cq, struct sock_rx_ctx *rx_ctx)
 {
 	struct dlist_entry *entry;
 	struct sock_rx_ctx *curr_ctx;
-	fastlock_acquire(&cq->list_lock);
+	pthread_mutex_lock(&cq->list_lock);
 
 	for (entry = cq->rx_list.next; entry != &cq->rx_list;
 	     entry = entry->next) {
@@ -91,15 +91,15 @@ void sock_cq_add_rx_ctx(struct sock_cq *cq, struct sock_rx_ctx *rx_ctx)
 	dlist_insert_tail(&rx_ctx->cq_entry, &cq->rx_list);
 	ofi_atomic_inc32(&cq->ref);
 out:
-	fastlock_release(&cq->list_lock);
+	pthread_mutex_unlock(&cq->list_lock);
 }
 
 void sock_cq_remove_rx_ctx(struct sock_cq *cq, struct sock_rx_ctx *rx_ctx)
 {
-	fastlock_acquire(&cq->list_lock);
+	pthread_mutex_lock(&cq->list_lock);
 	dlist_remove(&rx_ctx->cq_entry);
 	ofi_atomic_dec32(&cq->ref);
-	fastlock_release(&cq->list_lock);
+	pthread_mutex_unlock(&cq->list_lock);
 }
 
 int sock_cq_progress(struct sock_cq *cq)
@@ -111,7 +111,7 @@ int sock_cq_progress(struct sock_cq *cq)
 	if (cq->domain->progress_mode == FI_PROGRESS_AUTO)
 		return 0;
 
-	fastlock_acquire(&cq->list_lock);
+	pthread_mutex_lock(&cq->list_lock);
 	for (entry = cq->tx_list.next; entry != &cq->tx_list;
 	     entry = entry->next) {
 		tx_ctx = container_of(entry, struct sock_tx_ctx, cq_entry);
@@ -135,7 +135,7 @@ int sock_cq_progress(struct sock_cq *cq)
 		else
 			sock_pe_progress_ep_rx(cq->domain->pe, rx_ctx->ep_attr);
 	}
-	fastlock_release(&cq->list_lock);
+	pthread_mutex_unlock(&cq->list_lock);
 
 	return 0;
 }
@@ -176,7 +176,7 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr,
 	ssize_t ret;
 	struct sock_cq_overflow_entry_t *overflow_entry;
 
-	fastlock_acquire(&cq->lock);
+	pthread_mutex_lock(&cq->lock);
 	if (ofi_rbfdavail(&cq->cq_rbfd) < len) {
 		SOCK_LOG_ERROR("Not enough space in CQ\n");
 		overflow_entry = calloc(1, sizeof(*overflow_entry) + len);
@@ -208,7 +208,7 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr,
 	if (cq->signal)
 		sock_wait_signal(cq->waitset);
 out:
-	fastlock_release(&cq->lock);
+	pthread_mutex_unlock(&cq->lock);
 	return ret;
 }
 
@@ -354,14 +354,14 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count,
 	if (sock_cq->domain->progress_mode == FI_PROGRESS_MANUAL) {
 		while (1) {
 			sock_cq_progress(sock_cq);
-			fastlock_acquire(&sock_cq->lock);
+			pthread_mutex_lock(&sock_cq->lock);
 			avail = ofi_rbfdused(&sock_cq->cq_rbfd);
 			if (avail) {
 				ret = sock_cq_rbuf_read(sock_cq, buf,
 					MIN(threshold, (size_t)(avail / cq_entry_len)),
 					src_addr, cq_entry_len);
 			}
-			fastlock_release(&sock_cq->lock);
+			pthread_mutex_unlock(&sock_cq->lock);
 			if (ret)
 				return ret;
 
@@ -378,7 +378,7 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count,
 		};
 	} else {
 		do {
-			fastlock_acquire(&sock_cq->lock);
+			pthread_mutex_lock(&sock_cq->lock);
 			ret = 0;
 			avail = ofi_rbfdused(&sock_cq->cq_rbfd);
 			if (avail) {
@@ -388,7 +388,7 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count,
 			} else {
 				ofi_rbfdreset(&sock_cq->cq_rbfd);
 			}
-			fastlock_release(&sock_cq->lock);
+			pthread_mutex_unlock(&sock_cq->lock);
 			if (ret && ret != -FI_EAGAIN)
 				return ret;
 
@@ -440,7 +440,7 @@ static ssize_t sock_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf,
 	if (sock_cq->domain->progress_mode == FI_PROGRESS_MANUAL)
 		sock_cq_progress(sock_cq);
 
-	fastlock_acquire(&sock_cq->lock);
+	pthread_mutex_lock(&sock_cq->lock);
 	if (ofi_rbused(&sock_cq->cqerr_rb) >= sizeof(struct fi_cq_err_entry)) {
 		api_version = sock_cq->domain->fab->fab_fid.api_version;
 		ofi_rbread(&sock_cq->cqerr_rb, &entry, sizeof(entry));
@@ -463,7 +463,7 @@ static ssize_t sock_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf,
 	} else {
 		ret = -FI_EAGAIN;
 	}
-	fastlock_release(&sock_cq->lock);
+	pthread_mutex_unlock(&sock_cq->lock);
 	return ret;
 }
 
@@ -490,8 +490,8 @@ static int sock_cq_close(struct fid *fid)
 	ofi_rbfree(&cq->cqerr_rb);
 	ofi_rbfdfree(&cq->cq_rbfd);
 
-	fastlock_destroy(&cq->lock);
-	fastlock_destroy(&cq->list_lock);
+	pthread_mutex_destroy(&cq->lock);
+	pthread_mutex_destroy(&cq->list_lock);
 	ofi_atomic_dec32(&cq->domain->ref);
 
 	free(cq);
@@ -504,9 +504,9 @@ static int sock_cq_signal(struct fid_cq *cq)
 	sock_cq = container_of(cq, struct sock_cq, cq_fid);
 
 	ofi_atomic_set32(&sock_cq->signaled, 1);
-	fastlock_acquire(&sock_cq->lock);
+	pthread_mutex_lock(&sock_cq->lock);
 	ofi_rbfdsignal(&sock_cq->cq_rbfd);
-	fastlock_release(&sock_cq->lock);
+	pthread_mutex_unlock(&sock_cq->lock);
 	return 0;
 }
 
@@ -668,7 +668,7 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 	if (ret)
 		goto err3;
 
-	fastlock_init(&sock_cq->lock);
+	pthread_mutex_init(&sock_cq->lock, NULL);
 
 	switch (sock_cq->attr.wait_obj) {
 	case FI_WAIT_NONE:
@@ -713,7 +713,7 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
 
 	*cq = &sock_cq->cq_fid;
 	ofi_atomic_inc32(&sock_dom->ref);
-	fastlock_init(&sock_cq->list_lock);
+	pthread_mutex_init(&sock_cq->list_lock, NULL);
 
 	return 0;
 
@@ -735,7 +735,7 @@ int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry,
 	int ret;
 	struct fi_cq_err_entry err_entry;
 
-	fastlock_acquire(&cq->lock);
+	pthread_mutex_lock(&cq->lock);
 	if (ofi_rbavail(&cq->cqerr_rb) < sizeof(err_entry)) {
 		ret = -FI_ENOSPC;
 		goto out;
@@ -764,6 +764,6 @@ int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry,
 	ofi_rbfdsignal(&cq->cq_rbfd);
 
 out:
-	fastlock_release(&cq->lock);
+	pthread_mutex_unlock(&cq->lock);
 	return ret;
 }
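
Switching the CQ locks from fastlock_t (which libfabric can build as a spinlock) to pthread_mutex_t suits this path: sock_cq_sreadfrom() takes the lock repeatedly while other threads post completions, and sleeping waiters beat spinning ones when hold times are not tiny. The blocking wait itself still happens outside the lock, roughly as in this sketch (hypothetical ring-buffer fields):

#include <poll.h>
#include <pthread.h>
#include <stddef.h>

/* Hypothetical CQ: entries in a ring buffer guarded by a mutex, plus an
 * fd (rb_fd) that writers signal so readers can block without the lock. */
struct cq {
	pthread_mutex_t lock;
	int rb_fd;
	size_t used;
};

static ssize_t cq_sread(struct cq *cq, void *buf, size_t count)
{
	struct pollfd pfd = { .fd = cq->rb_fd, .events = POLLIN };
	ssize_t ret;

	(void) buf; (void) count;	/* entry copy elided */
	for (;;) {
		pthread_mutex_lock(&cq->lock);
		ret = cq->used ? 1 /* ...copy entries into buf... */ : 0;
		pthread_mutex_unlock(&cq->lock);
		if (ret)
			return ret;

		/* block outside the lock; a writer signals rb_fd */
		if (poll(&pfd, 1, -1) < 0)
			return -1;
	}
}
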
diff --git a/deps/libfabric/prov/sockets/src/sock_ctx.c b/deps/libfabric/prov/sockets/src/sock_ctx.c
index 6d051bd5c2cbdd2de73b2fb9ea06d96cbd47a5ba..30fe13dd5159090a42e94f41571cf07fe1f3fb30 100644
--- a/deps/libfabric/prov/sockets/src/sock_ctx.c
+++ b/deps/libfabric/prov/sockets/src/sock_ctx.c
@@ -57,6 +57,8 @@ struct sock_rx_ctx *sock_rx_ctx_alloc(const struct fi_rx_attr *attr,
 	dlist_init(&rx_ctx->rx_buffered_list);
 	dlist_init(&rx_ctx->ep_list);
 
+	rx_ctx->progress_start = &rx_ctx->rx_buffered_list;
+
 	fastlock_init(&rx_ctx->lock);
 
 	rx_ctx->ctx.fid.fclass = FI_CLASS_RX_CTX;
@@ -69,6 +71,15 @@ struct sock_rx_ctx *sock_rx_ctx_alloc(const struct fi_rx_attr *attr,
 
 void sock_rx_ctx_free(struct sock_rx_ctx *rx_ctx)
 {
+	struct sock_rx_entry *rx_buffered;
+
+	/* free any remaining buffered entries */
+	while (!dlist_empty(&rx_ctx->rx_buffered_list)) {
+		dlist_pop_front(&rx_ctx->rx_buffered_list,
+		                struct sock_rx_entry, rx_buffered, entry);
+		free(rx_buffered);
+	}
+
 	fastlock_destroy(&rx_ctx->lock);
 	free(rx_ctx->rx_entry_pool);
 	free(rx_ctx);
diff --git a/deps/libfabric/prov/sockets/src/sock_ep.c b/deps/libfabric/prov/sockets/src/sock_ep.c
index b11b982672a7aab052d601a1678d569d853f7769..9f5145d8423ba2b0e17f7d425f688478ee5cbf68 100644
--- a/deps/libfabric/prov/sockets/src/sock_ep.c
+++ b/deps/libfabric/prov/sockets/src/sock_ep.c
@@ -680,8 +680,9 @@ static int sock_ep_close(struct fid *fid)
 
 	if (sock_ep->attr->conn_handle.do_listen) {
 		fastlock_acquire(&sock_ep->attr->domain->conn_listener.signal_lock);
-		ofi_epoll_del(sock_ep->attr->domain->conn_listener.emap,
+		ofi_epoll_del(sock_ep->attr->domain->conn_listener.epollfd,
 		             sock_ep->attr->conn_handle.sock);
+		sock_ep->attr->domain->conn_listener.removed_from_epollfd = true;
 		fastlock_release(&sock_ep->attr->domain->conn_listener.signal_lock);
 		ofi_close_socket(sock_ep->attr->conn_handle.sock);
 		sock_ep->attr->conn_handle.do_listen = 0;
@@ -1758,8 +1759,7 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info,
 	/* default config */
 	sock_ep->attr->min_multi_recv = SOCK_EP_MIN_MULTI_RECV;
 
-	if (info)
-		memcpy(&sock_ep->attr->info, info, sizeof(struct fi_info));
+	memcpy(&sock_ep->attr->info, info, sizeof(struct fi_info));
 
 	sock_ep->attr->domain = sock_dom;
 	fastlock_init(&sock_ep->attr->cm.lock);
@@ -1786,6 +1786,8 @@ err1:
 
 void sock_ep_remove_conn(struct sock_ep_attr *attr, struct sock_conn *conn)
 {
+	if (attr->cmap.used <= 0 || conn->sock_fd == -1)
+		return;
 	sock_pe_poll_del(attr->domain->pe, conn->sock_fd);
 	sock_conn_release_entry(&attr->cmap, conn);
 }
@@ -1794,14 +1796,27 @@ struct sock_conn *sock_ep_lookup_conn(struct sock_ep_attr *attr, fi_addr_t index
 				      union ofi_sock_ip *addr)
 {
 	int i;
-	uint16_t idx;
+	uint64_t idx;
+	char buf[8];
 	struct sock_conn *conn;
 
 	idx = (attr->ep_type == FI_EP_MSG) ? index : index & attr->av->mask;
 
 	conn = ofi_idm_lookup(&attr->av_idm, idx);
 	if (conn && conn != SOCK_CM_CONN_IN_PROGRESS) {
-		if (conn->av_index == FI_ADDR_NOTAVAIL)
+		/* Verify that the existing connection is still usable, and
+		 * that the peer didn't restart.
+		 */
+		if (conn->connected == 0 ||
+		    (sock_comm_peek(conn, buf, 8) == 0 && conn->connected == 0)) {
+			sock_ep_remove_conn(attr, conn);
+			ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
+					"Peer disconnected", &addr->sa);
+			return NULL;
+		}
+		if (conn->av_index != FI_ADDR_NOTAVAIL)
+			assert(conn->av_index == idx);
+		else
 			conn->av_index = idx;
 		return conn;
 	}
@@ -1812,11 +1827,22 @@ struct sock_conn *sock_ep_lookup_conn(struct sock_ep_attr *attr, fi_addr_t index
 
 		if (ofi_equals_sockaddr(&attr->cmap.table[i].addr.sa, &addr->sa)) {
 			conn = &attr->cmap.table[i];
-			if (conn->av_index == FI_ADDR_NOTAVAIL)
-				conn->av_index = idx;
 			break;
 		}
 	}
+	if (conn && conn != SOCK_CM_CONN_IN_PROGRESS) {
+		if (conn->connected == 0 ||
+		    (sock_comm_peek(conn, buf, 8) == 0 && conn->connected == 0)) {
+			sock_ep_remove_conn(attr, conn);
+			ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
+					"Peer disconnected", &addr->sa);
+			return NULL;
+		}
+		if (conn->av_index != FI_ADDR_NOTAVAIL)
+			assert(conn->av_index == idx);
+		else
+			conn->av_index = idx;
+	}
 	return conn;
 }
 
@@ -1850,9 +1876,11 @@ int sock_ep_get_conn(struct sock_ep_attr *attr, struct sock_tx_ctx *tx_ctx,
 		ret = sock_ep_connect(attr, av_index, &conn);
 
 	if (!conn) {
-		SOCK_LOG_ERROR("Undable to find connection entry. "
+		SOCK_LOG_ERROR("Unable to find connection entry. "
 			       "Error in connecting: %s\n",
 			       fi_strerror(-ret));
+		ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
+				"Unable to connect to", &addr->sa);
 		return -FI_ENOENT;
 	}
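
The liveness test added to sock_ep_lookup_conn() peeks the socket to detect a peer that closed or restarted before reusing a cached connection. sock_comm_peek() is the provider's helper; a rough standalone equivalent using recv() with MSG_PEEK:

#include <errno.h>
#include <stdbool.h>
#include <sys/socket.h>

/* Returns true if the peer is gone.  On a stream socket, recv() returning
 * 0 means orderly shutdown; EAGAIN just means no data yet. */
static bool peer_disconnected(int fd)
{
	char buf[8];
	ssize_t n;

	n = recv(fd, buf, sizeof(buf), MSG_PEEK | MSG_DONTWAIT);
	if (n == 0)
		return true;			/* orderly EOF */
	if (n < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
		return true;			/* hard error: treat as dead */
	return false;				/* data pending or none yet */
}
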
 
diff --git a/deps/libfabric/prov/sockets/src/sock_ep_msg.c b/deps/libfabric/prov/sockets/src/sock_ep_msg.c
index 594650c9bbf12bdd059a7b83fa7698567e2ffb70..50498685c43aa723959113f65d43a1d946ca5021 100644
--- a/deps/libfabric/prov/sockets/src/sock_ep_msg.c
+++ b/deps/libfabric/prov/sockets/src/sock_ep_msg.c
@@ -223,13 +223,13 @@ static void sock_ep_cm_monitor_handle(struct sock_ep_cm_head *cm_head,
 {
 	int ret;
 
-	fastlock_acquire(&cm_head->signal_lock);
+	pthread_mutex_lock(&cm_head->signal_lock);
 	if (handle->monitored)
 		goto unlock;
 
 	/* Mark the handle as monitored before adding it to the pollset */
 	handle->monitored = 1;
-	ret = ofi_epoll_add(cm_head->emap, handle->sock_fd,
+	ret = ofi_epoll_add(cm_head->epollfd, handle->sock_fd,
 	                   events, handle);
 	if (ret) {
 		SOCK_LOG_ERROR("failed to monitor fd %d: %d\n",
@@ -239,7 +239,7 @@ static void sock_ep_cm_monitor_handle(struct sock_ep_cm_head *cm_head,
 		fd_signal_set(&cm_head->signal);
 	}
 unlock:
-	fastlock_release(&cm_head->signal_lock);
+	pthread_mutex_unlock(&cm_head->signal_lock);
 }
 
 static void
@@ -250,11 +250,12 @@ sock_ep_cm_unmonitor_handle_locked(struct sock_ep_cm_head *cm_head,
 	int ret;
 
 	if (handle->monitored) {
-		ret = ofi_epoll_del(cm_head->emap, handle->sock_fd);
+		ret = ofi_epoll_del(cm_head->epollfd, handle->sock_fd);
 		if (ret)
 			SOCK_LOG_ERROR("failed to unmonitor fd %d: %d\n",
 			               handle->sock_fd, ret);
 		handle->monitored = 0;
+		cm_head->removed_from_epollfd = true;
 	}
 
 	/* Multiple threads might call sock_ep_cm_unmonitor_handle() at the
@@ -271,9 +272,9 @@ static void sock_ep_cm_unmonitor_handle(struct sock_ep_cm_head *cm_head,
                                        struct sock_conn_req_handle *handle,
                                        int close_socket)
 {
-	fastlock_acquire(&cm_head->signal_lock);
+	pthread_mutex_lock(&cm_head->signal_lock);
 	sock_ep_cm_unmonitor_handle_locked(cm_head, handle, close_socket);
-	fastlock_release(&cm_head->signal_lock);
+	pthread_mutex_unlock(&cm_head->signal_lock);
 }
 
 static void sock_ep_cm_shutdown_report(struct sock_ep *ep, int send_shutdown)
@@ -728,9 +729,9 @@ static struct fi_info *sock_ep_msg_get_info(struct sock_pep *pep,
 
 void sock_ep_cm_signal(struct sock_ep_cm_head *cm_head)
 {
-	fastlock_acquire(&cm_head->signal_lock);
+	pthread_mutex_lock(&cm_head->signal_lock);
 	fd_signal_set(&cm_head->signal);
-	fastlock_release(&cm_head->signal_lock);
+	pthread_mutex_unlock(&cm_head->signal_lock);
 }
 
 static void sock_ep_cm_process_rejected(struct sock_ep_cm_head *cm_head,
@@ -783,13 +784,13 @@ sock_ep_cm_pop_from_msg_list(struct sock_ep_cm_head *cm_head)
 	struct dlist_entry *entry;
 	struct sock_conn_req_handle *hreq = NULL;
 
-	fastlock_acquire(&cm_head->signal_lock);
+	pthread_mutex_lock(&cm_head->signal_lock);
 	if (!dlist_empty(&cm_head->msg_list)) {
 		entry = cm_head->msg_list.next;
 		dlist_remove(entry);
 		hreq = container_of(entry, struct sock_conn_req_handle, entry);
 	}
-	fastlock_release(&cm_head->signal_lock);
+	pthread_mutex_unlock(&cm_head->signal_lock);
 	return hreq;
 }
 
@@ -1001,9 +1002,9 @@ static int sock_pep_reject(struct fid_pep *pep, fid_t handle,
 
 	cm_head = &_pep->cm_head;
 	hreq->state = SOCK_CONN_HANDLE_REJECTED;
-	fastlock_acquire(&cm_head->signal_lock);
+	pthread_mutex_lock(&cm_head->signal_lock);
 	sock_ep_cm_add_to_msg_list(cm_head, hreq);
-	fastlock_release(&cm_head->signal_lock);
+	pthread_mutex_unlock(&cm_head->signal_lock);
 	return 0;
 }
 
@@ -1166,14 +1167,23 @@ static void *sock_ep_cm_thread(void *arg)
 	while (cm_head->do_listen) {
 		sock_ep_cm_check_closing_rejected_list(cm_head);
 
-		num_fds = ofi_epoll_wait(cm_head->emap, ep_contexts,
+		num_fds = ofi_epoll_wait(cm_head->epollfd, ep_contexts,
 		                        SOCK_EPOLL_WAIT_EVENTS, -1);
 		if (num_fds < 0) {
 			SOCK_LOG_ERROR("poll failed : %s\n", strerror(errno));
 			continue;
 		}
 
-		fastlock_acquire(&cm_head->signal_lock);
+		pthread_mutex_lock(&cm_head->signal_lock);
+		if (cm_head->removed_from_epollfd) {
+			/* If a socket was removed from the epollfd after
+			 * ofi_epoll_wait returned, the returned contexts may
+			 * reference freed memory.  Skip this batch and
+			 * recheck for events.
+			 */
+			cm_head->removed_from_epollfd = false;
+			goto skip;
+		}
 		for (i = 0; i < num_fds; i++) {
 			handle = ep_contexts[i];
 
@@ -1195,7 +1205,8 @@ static void *sock_ep_cm_thread(void *arg)
 			assert(handle->sock_fd != INVALID_SOCKET);
 			sock_ep_cm_handle_rx(cm_head, handle);
 		}
-		fastlock_release(&cm_head->signal_lock);
+skip:
+		pthread_mutex_unlock(&cm_head->signal_lock);
 	}
 	return NULL;
 }
@@ -1205,10 +1216,10 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head)
 {
 	assert(cm_head->do_listen == 0);
 
-	fastlock_init(&cm_head->signal_lock);
+	pthread_mutex_init(&cm_head->signal_lock, NULL);
 	dlist_init(&cm_head->msg_list);
 
-	int ret = ofi_epoll_create(&cm_head->emap);
+	int ret = ofi_epoll_create(&cm_head->epollfd);
 	if (ret < 0) {
 		SOCK_LOG_ERROR("failed to create epoll set\n");
 		goto err1;
@@ -1221,7 +1232,7 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head)
 		goto err2;
 	}
 
-	ret = ofi_epoll_add(cm_head->emap,
+	ret = ofi_epoll_add(cm_head->epollfd,
 	                   cm_head->signal.fd[FI_READ_FD],
 	                   OFI_EPOLL_IN, NULL);
 	if (ret != 0){
@@ -1230,6 +1241,7 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head)
 	}
 
 	cm_head->do_listen = 1;
+	cm_head->removed_from_epollfd = false;
 	ret = pthread_create(&cm_head->listener_thread, 0,
 	                     sock_ep_cm_thread, cm_head);
 	if (ret) {
@@ -1242,7 +1254,7 @@ err3:
 	cm_head->do_listen = 0;
 	fd_signal_free(&cm_head->signal);
 err2:
-	ofi_epoll_close(cm_head->emap);
+	ofi_epoll_close(cm_head->epollfd);
 err1:
 	return ret;
 }
@@ -1251,9 +1263,9 @@ void sock_ep_cm_wait_handle_finalized(struct sock_ep_cm_head *cm_head,
                                       struct sock_conn_req_handle *handle)
 {
 	handle->state = SOCK_CONN_HANDLE_FINALIZING;
-	fastlock_acquire(&cm_head->signal_lock);
+	pthread_mutex_lock(&cm_head->signal_lock);
 	sock_ep_cm_add_to_msg_list(cm_head, handle);
-	fastlock_release(&cm_head->signal_lock);
+	pthread_mutex_unlock(&cm_head->signal_lock);
 
 	pthread_mutex_lock(&handle->finalized_mutex);
 	while (handle->state != SOCK_CONN_HANDLE_FINALIZED)
@@ -1272,10 +1284,10 @@ void sock_ep_cm_stop_thread(struct sock_ep_cm_head *cm_head)
 	sock_ep_cm_signal(cm_head);
 
 	if (cm_head->listener_thread &&
-			pthread_join(cm_head->listener_thread, NULL)) {
+	    pthread_join(cm_head->listener_thread, NULL)) {
 		SOCK_LOG_DBG("pthread join failed\n");
 	}
-	ofi_epoll_close(cm_head->emap);
+	ofi_epoll_close(cm_head->epollfd);
 	fd_signal_free(&cm_head->signal);
-	fastlock_destroy(&cm_head->signal_lock);
+	pthread_mutex_destroy(&cm_head->signal_lock);
 }
diff --git a/deps/libfabric/prov/sockets/src/sock_mr.c b/deps/libfabric/prov/sockets/src/sock_mr.c
index b033f7d54c0b3065e19de7908bb72ad0a49a0961..11d774669621e7ffdd5bd185c798eb849c71e945 100644
--- a/deps/libfabric/prov/sockets/src/sock_mr.c
+++ b/deps/libfabric/prov/sockets/src/sock_mr.c
@@ -133,6 +133,7 @@ static int sock_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 {
 	struct fi_eq_entry eq_entry;
 	struct sock_domain *dom;
+	struct fi_mr_attr cur_abi_attr;
 	struct sock_mr *_mr;
 	uint64_t key;
 	struct fid_domain *domain;
@@ -149,6 +150,8 @@ static int sock_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	if (!_mr)
 		return -FI_ENOMEM;
 
+	ofi_mr_update_attr(dom->fab->fab_fid.api_version, dom->info.caps,
+			   attr, &cur_abi_attr);
 	fastlock_acquire(&dom->lock);
 
 	_mr->mr_fid.fid.fclass = FI_CLASS_MR;
@@ -158,12 +161,12 @@ static int sock_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	_mr->domain = dom;
 	_mr->flags = flags;
 
-	ret = ofi_mr_map_insert(&dom->mr_map, attr, &key, _mr);
+	ret = ofi_mr_map_insert(&dom->mr_map, &cur_abi_attr, &key, _mr);
 	if (ret != 0)
 		goto err;
 
 	_mr->mr_fid.key = _mr->key = key;
-	_mr->mr_fid.mem_desc = (void *)(uintptr_t)key;
+	_mr->mr_fid.mem_desc = (void *) (uintptr_t) key;
 	fastlock_release(&dom->lock);
 
 	*mr = &_mr->mr_fid;
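
ofi_mr_update_attr() converts a caller's fi_mr_attr, which may follow an older libfabric ABI, into the current layout before it reaches ofi_mr_map_insert(), so later-added fields get sane defaults. A generic sketch of version-gated struct upgrading (entirely hypothetical struct and version number):

#include <stddef.h>
#include <string.h>

/* Hypothetical attr: v1 lacked 'iface'; v2 added it. */
struct my_attr {
	void	*base;
	size_t	len;
	int	iface;	/* new in ABI v2 */
};

#define MY_ABI_V2 2

static void my_update_attr(int api_version, const struct my_attr *user,
			   struct my_attr *cur)
{
	memset(cur, 0, sizeof(*cur));
	/* fields present in every ABI version copy over directly */
	cur->base = user->base;
	cur->len = user->len;
	/* newer fields are read only from callers that speak that ABI;
	 * older callers get the zeroed default */
	if (api_version >= MY_ABI_V2)
		cur->iface = user->iface;
}
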
diff --git a/deps/libfabric/prov/sockets/src/sock_msg.c b/deps/libfabric/prov/sockets/src/sock_msg.c
index 0d09300b8bb0a3d2d038512a42d172f169560ac2..a0e44e6e1d604598585f8d833ac00d7aa257e040 100644
--- a/deps/libfabric/prov/sockets/src/sock_msg.c
+++ b/deps/libfabric/prov/sockets/src/sock_msg.c
@@ -135,6 +135,7 @@ ssize_t sock_ep_recvmsg(struct fid_ep *ep, const struct fi_msg *msg,
 	SOCK_LOG_DBG("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx);
 	fastlock_acquire(&rx_ctx->lock);
 	dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list);
+	rx_ctx->progress_start = &rx_ctx->rx_buffered_list;
 	fastlock_release(&rx_ctx->lock);
 	return 0;
 }
@@ -479,6 +480,7 @@ ssize_t sock_ep_trecvmsg(struct fid_ep *ep,
 	fastlock_acquire(&rx_ctx->lock);
 	SOCK_LOG_DBG("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx);
 	dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list);
+	rx_ctx->progress_start = &rx_ctx->rx_buffered_list;
 	fastlock_release(&rx_ctx->lock);
 	return 0;
 }
diff --git a/deps/libfabric/prov/sockets/src/sock_poll.c b/deps/libfabric/prov/sockets/src/sock_poll.c
index e856774a173e2dc6df519cb20dc214fd63e75c12..6493b143561650aa5428551ba0ab6fd61bec36a4 100644
--- a/deps/libfabric/prov/sockets/src/sock_poll.c
+++ b/deps/libfabric/prov/sockets/src/sock_poll.c
@@ -129,12 +129,12 @@ static int sock_poll_poll(struct fid_poll *pollset, void **context, int count)
 			cq = container_of(list_item->fid, struct sock_cq,
 						cq_fid);
 			sock_cq_progress(cq);
-			fastlock_acquire(&cq->lock);
+			pthread_mutex_lock(&cq->lock);
 			if (ofi_rbfdused(&cq->cq_rbfd) || ofi_rbused(&cq->cqerr_rb)) {
 				*context++ = cq->cq_fid.fid.context;
 				ret_count++;
 			}
-			fastlock_release(&cq->lock);
+			pthread_mutex_unlock(&cq->lock);
 			break;
 
 		case FI_CLASS_CNTR:
diff --git a/deps/libfabric/prov/sockets/src/sock_progress.c b/deps/libfabric/prov/sockets/src/sock_progress.c
index 6cafe28ce08bc40a984009fd82ef7a86f941ec41..b8f21962fbbf339ca8babd65d64f0e2b2c8dd23a 100644
--- a/deps/libfabric/prov/sockets/src/sock_progress.c
+++ b/deps/libfabric/prov/sockets/src/sock_progress.c
@@ -68,8 +68,9 @@
 		(((uint64_t)_addr) >> (64 - _bits)))
 
 
-static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx);
-
+#define SOCK_EP_MAX_PROGRESS_CNT 10
+static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx,
+					bool shallow);
 
 static inline int sock_pe_is_data_msg(int msg_id)
 {
@@ -864,16 +865,15 @@ static void sock_pe_do_atomic(void *cmp, void *dst, void *src,
 {
 	char tmp_result[SOCK_EP_MAX_ATOMIC_SZ];
 
-	if (op >= OFI_SWAP_OP_START) {
-		ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst,
-			src, cmp, tmp_result, cnt);
+	if (ofi_atomic_isswap_op(op)) {
+		ofi_atomic_swap_handler(op, datatype, dst, src, cmp,
+					tmp_result, cnt);
                 if (cmp != NULL)
 			memcpy(cmp, tmp_result, ofi_datatype_size(datatype) * cnt);
-	} else if (fetch) {
-		ofi_atomic_readwrite_handlers[op][datatype](dst, src,
-			cmp /*results*/, cnt);
-	} else {
-		ofi_atomic_write_handlers[op][datatype](dst, src, cnt);
+	} else if (fetch && ofi_atomic_isreadwrite_op(op)) {
+		ofi_atomic_readwrite_handler(op, datatype, dst, src, cmp, cnt);
+	} else if (ofi_atomic_iswrite_op(op)) {
+		ofi_atomic_write_handler(op, datatype, dst, src, cnt);
 	}
 }
 
@@ -1059,7 +1059,7 @@ sock_pe_process_rx_tatomic(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx,
 
 	pe_entry->pe.rx.rx_entry = rx_entry;
 
-	sock_pe_progress_buffered_rx(rx_ctx);
+	sock_pe_progress_buffered_rx(rx_ctx, true);
 	fastlock_release(&rx_ctx->lock);
 
 	pe_entry->is_complete = 1;
@@ -1177,21 +1177,36 @@ ssize_t sock_rx_claim_recv(struct sock_rx_ctx *rx_ctx, void *context,
 	return ret;
 }
 
-static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx)
+/* Match the buffered message list against the posted receive list.  If
+ * shallow is true, check at most SOCK_EP_MAX_PROGRESS_CNT messages so a
+ * single progress pass does not take too long. */
+static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx,
+					bool shallow)
 {
 	struct dlist_entry *entry;
 	struct sock_pe_entry pe_entry;
 	struct sock_rx_entry *rx_buffered, *rx_posted;
 	size_t i, rem = 0, offset, len, used_len, dst_offset, datatype_sz;
+	size_t max_cnt;
 	char *src, *dst;
 
 	if (dlist_empty(&rx_ctx->rx_entry_list) ||
 	    dlist_empty(&rx_ctx->rx_buffered_list))
 		return 0;
 
-	for (entry = rx_ctx->rx_buffered_list.next;
-	     entry != &rx_ctx->rx_buffered_list;) {
-
+	if (!shallow) {
+		/* ignoring rx_ctx->progress_start */
+		entry = rx_ctx->rx_buffered_list.next;
+		max_cnt = SIZE_MAX;
+	} else {
+		/* continue where the last pass left off */
+		entry = rx_ctx->progress_start;
+		if (entry == &rx_ctx->rx_buffered_list) {
+			entry = entry->next;
+		}
+		max_cnt = SOCK_EP_MAX_PROGRESS_CNT;
+	}
+	for (i = 0; i < max_cnt && entry != &rx_ctx->rx_buffered_list; i++) {
 		rx_buffered = container_of(entry, struct sock_rx_entry, entry);
 		entry = entry->next;
 
@@ -1294,6 +1309,8 @@ static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx)
 			rx_ctx->num_left++;
 		}
 	}
+	/* remember where we left off for next shallow progress */
+	rx_ctx->progress_start = entry;
 	return 0;
 }
 
@@ -1308,6 +1325,10 @@ static int sock_pe_process_rx_send(struct sock_pe *pe,
 	offset = 0;
 	len = sizeof(struct sock_msg_hdr);
 
+	if (pe_entry->addr == FI_ADDR_NOTAVAIL &&
+	    pe_entry->ep_attr->ep_type == FI_EP_RDM && pe_entry->ep_attr->av)
+		pe_entry->addr = pe_entry->conn->av_index;
+
 	if (pe_entry->msg_hdr.op_type == SOCK_OP_TSEND) {
 		if (sock_pe_recv_field(pe_entry, &pe_entry->tag,
 				       SOCK_TAG_SIZE, len))
@@ -1325,7 +1346,8 @@ static int sock_pe_process_rx_send(struct sock_pe *pe,
 	data_len = pe_entry->msg_hdr.msg_len - len;
 	if (pe_entry->done_len == len && !pe_entry->pe.rx.rx_entry) {
 		fastlock_acquire(&rx_ctx->lock);
-		sock_pe_progress_buffered_rx(rx_ctx);
+		rx_ctx->progress_start = &rx_ctx->rx_buffered_list;
+		sock_pe_progress_buffered_rx(rx_ctx, false);
 
 		rx_entry = sock_rx_get_entry(rx_ctx, pe_entry->addr, pe_entry->tag,
 					     pe_entry->msg_hdr.op_type == SOCK_OP_TSEND ? 1 : 0);
@@ -1923,13 +1945,12 @@ static int sock_pe_progress_tx_entry(struct sock_pe *pe,
 		goto out;
 
 	if (sock_comm_is_disconnected(pe_entry)) {
-		SOCK_LOG_DBG("conn disconnected: removing fd from pollset\n");
-		if (pe_entry->ep_attr->cmap.used > 0 &&
-		     pe_entry->conn->sock_fd != -1) {
-			fastlock_acquire(&pe_entry->ep_attr->cmap.lock);
-			sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn);
-			fastlock_release(&pe_entry->ep_attr->cmap.lock);
-		}
+		ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_DATA,
+				"Peer disconnected: removing fd from pollset",
+				&pe_entry->conn->addr.sa);
+		fastlock_acquire(&pe_entry->ep_attr->cmap.lock);
+		sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn);
+		fastlock_release(&pe_entry->ep_attr->cmap.lock);
 
 		sock_pe_report_tx_error(pe_entry, 0, FI_EIO);
 		pe_entry->is_complete = 1;
@@ -2002,13 +2023,12 @@ static int sock_pe_progress_rx_pe_entry(struct sock_pe *pe,
 	int ret;
 
 	if (sock_comm_is_disconnected(pe_entry)) {
-		SOCK_LOG_DBG("conn disconnected: removing fd from pollset\n");
-		if (pe_entry->ep_attr->cmap.used > 0 &&
-		     pe_entry->conn->sock_fd != -1) {
-			fastlock_acquire(&pe_entry->ep_attr->cmap.lock);
-			sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn);
-			fastlock_release(&pe_entry->ep_attr->cmap.lock);
-		}
+		ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_DATA,
+				"Peer disconnected: removing fd from pollset",
+				&pe_entry->conn->addr.sa);
+		fastlock_acquire(&pe_entry->ep_attr->cmap.lock);
+		sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn);
+		fastlock_release(&pe_entry->ep_attr->cmap.lock);
 
 		if (pe_entry->pe.rx.header_read)
 			sock_pe_report_rx_error(pe_entry, 0, FI_EIO);
@@ -2400,7 +2420,7 @@ int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx)
 	fastlock_acquire(&pe->lock);
 
 	fastlock_acquire(&rx_ctx->lock);
-	sock_pe_progress_buffered_rx(rx_ctx);
+	sock_pe_progress_buffered_rx(rx_ctx, true);
 	fastlock_release(&rx_ctx->lock);
 
 	/* check for incoming data */
@@ -2697,7 +2717,7 @@ struct sock_pe *sock_pe_init(struct sock_domain *domain)
 	pthread_mutex_init(&pe->list_lock, NULL);
 	pe->domain = domain;
 
-	
+
 	ret = ofi_bufpool_create(&pe->pe_rx_pool,
 				 sizeof(struct sock_pe_entry), 16, 0, 1024, 0);
 	if (ret) {
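
progress_start makes the buffered-receive scan incremental: a shallow pass resumes at the saved cursor and inspects at most SOCK_EP_MAX_PROGRESS_CNT entries, and any event that could match older entries (a new posted receive, a new buffered message, a full pass) resets the cursor to the list head. The core idea over a circular intrusive list, as a sketch (hypothetical list type):

#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next, *prev; };

struct scan_state {
	struct node head;	/* sentinel of a circular list */
	struct node *cursor;	/* where the next shallow pass resumes */
};

static void scan(struct scan_state *s, bool shallow, size_t limit,
		 void (*visit)(struct node *))
{
	struct node *n = shallow ? s->cursor : s->head.next;
	size_t i;

	if (n == &s->head)	/* cursor parked on the sentinel */
		n = n->next;

	for (i = 0; n != &s->head && (!shallow || i < limit); i++) {
		struct node *next = n->next;	/* visit() may unlink n */

		visit(n);
		n = next;
	}
	s->cursor = n;	/* callers reset this to &s->head whenever new
			 * work might match entries behind the cursor */
}
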
diff --git a/deps/libfabric/prov/sockets/src/sock_rx_entry.c b/deps/libfabric/prov/sockets/src/sock_rx_entry.c
index 7ff4b5576b71f58d71bab57c5825ea9e30c01e30..8f3e082df6afe6ab99bd2c6f5b1496166b64343e 100644
--- a/deps/libfabric/prov/sockets/src/sock_rx_entry.c
+++ b/deps/libfabric/prov/sockets/src/sock_rx_entry.c
@@ -124,6 +124,7 @@ struct sock_rx_entry *sock_rx_new_buffered_entry(struct sock_rx_ctx *rx_ctx,
 
 	rx_ctx->buffered_len += len;
 	dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_buffered_list);
+	rx_ctx->progress_start = &rx_ctx->rx_buffered_list;
 
 	return rx_entry;
 }
diff --git a/deps/libfabric/prov/tcp/src/tcpx.h b/deps/libfabric/prov/tcp/src/tcpx.h
index 0a5c59c7a2fac43b53a7794a7629eb0a0eaee788..7de2a63be5647781ce3eeb8d08ebb64d65a99a75 100644
--- a/deps/libfabric/prov/tcp/src/tcpx.h
+++ b/deps/libfabric/prov/tcp/src/tcpx.h
@@ -70,7 +70,6 @@
 #define TCPX_MAX_INJECT_SZ	(64)
 
 #define MAX_POLL_EVENTS		100
-#define STAGE_BUF_SIZE		512
 
 #define TCPX_MIN_MULTI_RECV	16384
 
@@ -101,6 +100,7 @@ enum tcpx_cm_event_type {
 	SERVER_RECV_CONNREQ,
 	SERVER_SEND_CM_ACCEPT,
 	CLIENT_RECV_CONNRESP,
+	CLIENT_SERVER_ERROR,
 };
 
 struct tcpx_cm_context {
@@ -129,12 +129,13 @@ struct tcpx_pep {
 	struct tcpx_cm_context	cm_ctx;
 };
 
-enum tcpx_cm_state {
-	TCPX_EP_CONNECTING,
-	TCPX_EP_CONNECTED,
-	TCPX_EP_SHUTDOWN,
-	TCPX_EP_POLL_REMOVED,
-	TCPX_EP_ERROR,
+enum tcpx_state {
+	TCPX_IDLE,
+	TCPX_CONNECTING,
+	TCPX_RCVD_REQ,
+	TCPX_ACCEPTING,
+	TCPX_CONNECTED,
+	TCPX_DISCONNECTED,
 };
 
 struct tcpx_base_hdr {
@@ -177,13 +178,15 @@ struct tcpx_rx_ctx {
 };
 
 typedef int (*tcpx_rx_process_fn_t)(struct tcpx_xfer_entry *rx_entry);
-typedef int (*tcpx_get_rx_func_t)(struct tcpx_ep *ep);
+
+enum {
+	STAGE_BUF_SIZE = 512
+};
 
 struct stage_buf {
 	uint8_t			buf[STAGE_BUF_SIZE];
-	size_t			size;
-	size_t			len;
-	size_t			off;
+	size_t			bytes_avail;
+	size_t			cur_pos;
 };
 
 struct tcpx_ep {
@@ -198,10 +201,10 @@ struct tcpx_ep {
 	struct slist		tx_rsp_pend_queue;
 	struct slist		rma_read_queue;
 	struct tcpx_rx_ctx	*srx_ctx;
-	enum tcpx_cm_state	cm_state;
-	/* lock for protecting tx/rx queues,rma list,cm_state*/
+	enum tcpx_state		state;
+	/* lock protecting the tx/rx queues, rma list, and state */
 	fastlock_t		lock;
-	tcpx_get_rx_func_t	get_rx_entry[ofi_op_write + 1];
+	int (*start_op[ofi_op_write + 1])(struct tcpx_ep *ep);
 	void (*hdr_bswap)(struct tcpx_base_hdr *hdr);
 	struct stage_buf	stage_buf;
 	size_t			min_multi_recv_size;
@@ -267,6 +270,7 @@ int tcpx_domain_open(struct fid_fabric *fabric, struct fi_info *info,
 
 int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 		  struct fid_ep **ep_fid, void *context);
+void tcpx_ep_disable(struct tcpx_ep *ep, int cm_err);
 
 
 int tcpx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
@@ -278,16 +282,15 @@ void tcpx_cq_report_error(struct util_cq *cq,
 			  int err);
 
 
+ssize_t tcpx_recv_hdr(SOCKET sock, struct stage_buf *stage_buf,
+		      struct tcpx_cur_rx_msg *cur_rx_msg);
 int tcpx_recv_msg_data(struct tcpx_xfer_entry *recv_entry);
 int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry);
-int tcpx_comm_recv_hdr(SOCKET sock, struct stage_buf *sbuf,
-		        struct tcpx_cur_rx_msg *cur_rx_msg);
 int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf);
 
 struct tcpx_xfer_entry *tcpx_xfer_entry_alloc(struct tcpx_cq *cq,
 					      enum tcpx_xfer_op_codes type);
 
-void tcpx_ep_wait_fd_del(struct tcpx_ep *ep);
 void tcpx_xfer_entry_release(struct tcpx_cq *tcpx_cq,
 			     struct tcpx_xfer_entry *xfer_entry);
 void tcpx_srx_xfer_release(struct tcpx_rx_ctx *srx_ctx,
@@ -305,7 +308,6 @@ int tcpx_try_func(void *util_ep);
 void tcpx_hdr_none(struct tcpx_base_hdr *hdr);
 void tcpx_hdr_bswap(struct tcpx_base_hdr *hdr);
 
-int tcpx_ep_shutdown_report(struct tcpx_ep *ep, fid_t fid);
 void tcpx_tx_queue_insert(struct tcpx_ep *tcpx_ep,
 			  struct tcpx_xfer_entry *tx_entry);
 
@@ -314,10 +316,10 @@ int tcpx_eq_wait_try_func(void *arg);
 int tcpx_eq_create(struct fid_fabric *fabric_fid, struct fi_eq_attr *attr,
 		   struct fid_eq **eq_fid, void *context);
 
-int tcpx_get_rx_entry_op_invalid(struct tcpx_ep *tcpx_ep);
-int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep);
-int tcpx_get_rx_entry_op_read_req(struct tcpx_ep *tcpx_ep);
-int tcpx_get_rx_entry_op_write(struct tcpx_ep *tcpx_ep);
-int tcpx_get_rx_entry_op_read_rsp(struct tcpx_ep *tcpx_ep);
+int tcpx_op_invalid(struct tcpx_ep *tcpx_ep);
+int tcpx_op_msg(struct tcpx_ep *tcpx_ep);
+int tcpx_op_read_req(struct tcpx_ep *tcpx_ep);
+int tcpx_op_write(struct tcpx_ep *tcpx_ep);
+int tcpx_op_read_rsp(struct tcpx_ep *tcpx_ep);
 
 #endif //_TCP_H_
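
The renamed tcpx_state enum models the whole endpoint lifecycle rather than just CM progress, which is what lets tcpx_ep_connect() reject a second connect on a non-idle endpoint. A hedged sketch of the transitions implied by this patch (inferred, not authoritative):

#include <stdbool.h>

enum ep_state {
	ST_IDLE, ST_CONNECTING, ST_RCVD_REQ,
	ST_ACCEPTING, ST_CONNECTED, ST_DISCONNECTED,
};

/* Active side: IDLE -> CONNECTING -> CONNECTED.
 * Passive side: RCVD_REQ -> ACCEPTING -> CONNECTED.
 * Either side may drop to DISCONNECTED on error or shutdown. */
static bool valid_transition(enum ep_state from, enum ep_state to)
{
	switch (from) {
	case ST_IDLE:
		return to == ST_CONNECTING || to == ST_RCVD_REQ;
	case ST_CONNECTING:
	case ST_ACCEPTING:
		return to == ST_CONNECTED || to == ST_DISCONNECTED;
	case ST_RCVD_REQ:
		return to == ST_ACCEPTING || to == ST_DISCONNECTED;
	case ST_CONNECTED:
		return to == ST_DISCONNECTED;
	default:
		return false;
	}
}
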
diff --git a/deps/libfabric/prov/tcp/src/tcpx_comm.c b/deps/libfabric/prov/tcp/src/tcpx_comm.c
index 43eef1405c9503f072c8fce30269b189edbe9811..d790b01379fd1cd864b3707368f3d90109211720 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_comm.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_comm.c
@@ -57,68 +57,51 @@ int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry)
 	return FI_SUCCESS;
 }
 
-static ssize_t tcpx_read_from_buffer(struct stage_buf *sbuf,
+static ssize_t tcpx_read_from_buffer(struct stage_buf *stage_buf,
 				     uint8_t *buf, size_t len)
 {
 	size_t rem_size;
 	ssize_t ret;
 
-	assert(sbuf->len >= sbuf->off);
-	rem_size = sbuf->len - sbuf->off;
-	assert(rem_size);
+	assert(stage_buf->cur_pos < stage_buf->bytes_avail);
+	rem_size = stage_buf->bytes_avail - stage_buf->cur_pos;
 	ret = (rem_size >= len) ? len : rem_size;
-	memcpy(buf, &sbuf->buf[sbuf->off], ret);
-	sbuf->off += ret;
+	memcpy(buf, &stage_buf->buf[stage_buf->cur_pos], ret);
+	stage_buf->cur_pos += ret;
 	return ret;
 }
 
-static int tcpx_recv_hdr(SOCKET sock, struct stage_buf *sbuf,
-			  struct tcpx_cur_rx_msg *cur_rx_msg)
+ssize_t tcpx_recv_hdr(SOCKET sock, struct stage_buf *stage_buf,
+		      struct tcpx_cur_rx_msg *cur_rx_msg)
 {
-	void *rem_buf;
+	ssize_t bytes_recvd, bytes_read;
 	size_t rem_len;
-	ssize_t bytes_recvd;
+	void *rem_buf;
 
 	rem_buf = (uint8_t *) &cur_rx_msg->hdr + cur_rx_msg->done_len;
 	rem_len = cur_rx_msg->hdr_len - cur_rx_msg->done_len;
 
-	if (sbuf->len != sbuf->off)
-		bytes_recvd = tcpx_read_from_buffer(sbuf, rem_buf, rem_len);
-	else
-		bytes_recvd = ofi_recv_socket(sock, rem_buf, rem_len, 0);
-	if (bytes_recvd <= 0)
-		return bytes_recvd ? -ofi_sockerr(): -FI_ENOTCONN;
+	if (stage_buf->cur_pos < stage_buf->bytes_avail) {
+		bytes_read = tcpx_read_from_buffer(stage_buf, rem_buf, rem_len);
+		rem_len -= bytes_read;
+		if (!rem_len)
+			return bytes_read;
 
-	return bytes_recvd;
-}
+		rem_buf = (char *) rem_buf + bytes_read;
+	} else {
+		bytes_read = 0;
+	}
 
-int tcpx_comm_recv_hdr(SOCKET sock, struct stage_buf *sbuf,
-		        struct tcpx_cur_rx_msg *cur_rx_msg)
-{
-	ssize_t bytes_recvd;
-	bytes_recvd = tcpx_recv_hdr(sock, sbuf, cur_rx_msg);
+	bytes_recvd = ofi_recv_socket(sock, rem_buf, rem_len, 0);
 	if (bytes_recvd < 0)
-		return bytes_recvd;
-	cur_rx_msg->done_len += bytes_recvd;
-
-	if (cur_rx_msg->done_len == sizeof(cur_rx_msg->hdr.base_hdr)) {
-		cur_rx_msg->hdr_len = (size_t) cur_rx_msg->hdr.base_hdr.payload_off;
-
-		if (cur_rx_msg->hdr_len > cur_rx_msg->done_len) {
-			bytes_recvd = tcpx_recv_hdr(sock, sbuf, cur_rx_msg);
-			if (bytes_recvd < 0)
-				return bytes_recvd;
-			cur_rx_msg->done_len += bytes_recvd;
-			return (cur_rx_msg->done_len == cur_rx_msg->hdr_len) ?
-				FI_SUCCESS : -FI_EAGAIN;
-		}
-	}
+		return bytes_read ? bytes_read : -ofi_sockerr();
+	else if (bytes_recvd == 0)
+		return -FI_ENOTCONN;
 
-	return (cur_rx_msg->done_len == cur_rx_msg->hdr_len) ?
-		FI_SUCCESS : -FI_EAGAIN;
+	return bytes_read + bytes_recvd;
 }
 
-static ssize_t tcpx_readv_from_buffer(struct stage_buf *sbuf,
+static ssize_t tcpx_readv_from_buffer(struct stage_buf *stage_buf,
 				      struct iovec *iov,
 				      int iov_cnt)
 {
@@ -127,15 +110,15 @@ static ssize_t tcpx_readv_from_buffer(struct stage_buf *sbuf,
 	int i;
 
 	if (iov_cnt == 1)
-		return tcpx_read_from_buffer(sbuf, iov[0].iov_base,
+		return tcpx_read_from_buffer(stage_buf, iov[0].iov_base,
 					     iov[0].iov_len);
 
 	for (i = 0; i < iov_cnt; i++) {
-		bytes_read = tcpx_read_from_buffer(sbuf, iov[i].iov_base,
+		bytes_read = tcpx_read_from_buffer(stage_buf, iov[i].iov_base,
 						   iov[i].iov_len);
 		ret += bytes_read;
 		if ((bytes_read < iov[i].iov_len) ||
-		    !(sbuf->len - sbuf->off))
+		    !(stage_buf->bytes_avail - stage_buf->cur_pos))
 			break;
 	}
 	return ret;
@@ -143,25 +126,34 @@ static ssize_t tcpx_readv_from_buffer(struct stage_buf *sbuf,
 
 int tcpx_recv_msg_data(struct tcpx_xfer_entry *rx_entry)
 {
-	ssize_t bytes_recvd;
+	struct stage_buf *stage_buf;
+	ssize_t bytes_recvd, bytes_read;
 
 	if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len)
 		return FI_SUCCESS;
 
-	if (rx_entry->ep->stage_buf.len != rx_entry->ep->stage_buf.off)
-		bytes_recvd = tcpx_readv_from_buffer(&rx_entry->ep->stage_buf,
-						     rx_entry->iov,
-						     rx_entry->iov_cnt);
-	else
-		bytes_recvd = ofi_readv_socket(rx_entry->ep->sock,
-					       rx_entry->iov,
-					       rx_entry->iov_cnt);
-	if (bytes_recvd <= 0)
-		return (bytes_recvd) ? -ofi_sockerr(): -FI_ENOTCONN;
+	stage_buf = &rx_entry->ep->stage_buf;
+	if (stage_buf->cur_pos < stage_buf->bytes_avail) {
+		bytes_read = tcpx_readv_from_buffer(stage_buf,
+						    rx_entry->iov,
+						    rx_entry->iov_cnt);
+		ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, bytes_read);
+		if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len)
+			return FI_SUCCESS;
+	} else {
+		bytes_read = 0;
+	}
+
+	bytes_recvd = ofi_readv_socket(rx_entry->ep->sock, rx_entry->iov,
+				       rx_entry->iov_cnt);
+	if (bytes_recvd < 0)
+		return bytes_read ? -FI_EAGAIN : -ofi_sockerr();
+	else if (bytes_recvd == 0)
+		return -FI_ENOTCONN;
 
 	ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, bytes_recvd);
-	return (rx_entry->iov_cnt && rx_entry->iov[0].iov_len) ?
-		-FI_EAGAIN: FI_SUCCESS;
+	return (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len) ?
+		FI_SUCCESS : -FI_EAGAIN;
 }
 
 int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf)
@@ -169,11 +161,11 @@ int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf)
 	int bytes_recvd;
 
 	bytes_recvd = ofi_recv_socket(sock, stage_buf->buf,
-				      stage_buf->size, 0);
+				      sizeof(stage_buf->buf), 0);
 	if (bytes_recvd <= 0)
 		return (bytes_recvd) ? -ofi_sockerr(): -FI_ENOTCONN;
 
-	stage_buf->len = bytes_recvd;
-	stage_buf->off = 0;
+	stage_buf->bytes_avail = bytes_recvd;
+	stage_buf->cur_pos = 0;
 	return FI_SUCCESS;
 }
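
tcpx_recv_hdr() now combines both sources in one call: drain whatever is left in the staging buffer, then top up from the socket, and report partial progress instead of an error when the socket would block. A condensed sketch of that two-phase read (hypothetical names):

#include <errno.h>
#include <string.h>
#include <sys/socket.h>

struct stage {
	unsigned char buf[512];
	size_t avail;	/* bytes_avail: valid bytes in buf */
	size_t pos;	/* cur_pos: next unread byte */
};

static ssize_t staged_recv(int fd, struct stage *st, void *dst, size_t len)
{
	size_t staged = 0;
	ssize_t n;

	if (st->pos < st->avail) {		/* phase 1: staged bytes */
		staged = st->avail - st->pos;
		if (staged > len)
			staged = len;
		memcpy(dst, &st->buf[st->pos], staged);
		st->pos += staged;
		if (staged == len)
			return (ssize_t) len;
	}

	n = recv(fd, (char *) dst + staged, len - staged, 0);	/* phase 2 */
	if (n < 0)		/* report staged progress before an error */
		return staged ? (ssize_t) staged : -errno;
	if (n == 0)
		return -ENOTCONN;	/* peer closed the connection */
	return (ssize_t) (staged + n);
}
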
diff --git a/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c b/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c
index 286fd9e9fdc880d3c487c113a169e8826eba4ff0..de7f5b2296ccdcce8c1d8793d8aca7d2b57106a0 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_conn_mgr.c
@@ -60,9 +60,10 @@ static int rx_cm_data(SOCKET fd, struct ofi_ctrl_hdr *hdr,
 		goto out;
 	}
 
-	if (hdr->type != type) {
+	if (hdr->type != type && hdr->type != ofi_ctrl_nack) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"unexpected cm message type\n");
+			"unexpected cm message type; expected %d or %d, got %d\n",
+			type, ofi_ctrl_nack, hdr->type);
 		ret = -FI_ECONNREFUSED;
 		goto out;
 	}
@@ -86,6 +87,14 @@ static int rx_cm_data(SOCKET fd, struct ofi_ctrl_hdr *hdr,
 					   TCPX_MAX_CM_DATA_SIZE);
 		}
 	}
+
+	if (hdr->type == ofi_ctrl_nack) {
+		FI_INFO(&tcpx_prov, FI_LOG_EP_CTRL,
+			"Connection refused by remote peer\n");
+		ret = -FI_ECONNREFUSED;
+		goto out;
+	}
+
 	ret = 0;
 out:
 	cm_ctx->cm_data_sz = data_size;
@@ -119,12 +128,12 @@ err:
 	return ofi_sockerr() ? -ofi_sockerr() : -FI_EIO;
 }
 
-static int tcpx_ep_enable_xfers(struct tcpx_ep *ep)
+static int tcpx_ep_enable(struct tcpx_ep *ep)
 {
 	int ret;
 
 	fastlock_acquire(&ep->lock);
-	if (ep->cm_state != TCPX_EP_CONNECTING) {
+	if (ep->state != TCPX_CONNECTING && ep->state != TCPX_ACCEPTING) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"ep is in invalid state\n");
 		ret = -FI_EINVAL;
@@ -137,7 +146,7 @@ static int tcpx_ep_enable_xfers(struct tcpx_ep *ep)
 			"failed to set socket to nonblocking\n");
 		goto unlock;
 	}
-	ep->cm_state = TCPX_EP_CONNECTED;
+	ep->state = TCPX_CONNECTED;
 	fastlock_release(&ep->lock);
 
 	if (ep->util_ep.rx_cq) {
@@ -145,6 +154,11 @@ static int tcpx_ep_enable_xfers(struct tcpx_ep *ep)
 				      ep->sock, POLLIN, tcpx_try_func,
 				      (void *) &ep->util_ep,
 				      &ep->util_ep.ep_fid.fid);
+		if (ret) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+				"Failed to add fd to rx_cq\n");
+			return ret;
+		}
 	}
 
 	if (ep->util_ep.tx_cq) {
@@ -152,8 +166,15 @@ static int tcpx_ep_enable_xfers(struct tcpx_ep *ep)
 				      ep->sock, POLLIN, tcpx_try_func,
 				      (void *) &ep->util_ep,
 				      &ep->util_ep.ep_fid.fid);
+		if (ret) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+				"Failed to add fd to tx_cq\n");
+			return ret;
+		}
 	}
 
+	/* TODO: Move writing CONNECTED event here */
+
 	return ret;
 unlock:
 	fastlock_release(&ep->lock);
@@ -170,7 +191,9 @@ static int proc_conn_resp(struct tcpx_cm_context *cm_ctx,
 
 	ret = rx_cm_data(ep->sock, &conn_resp, ofi_ctrl_connresp, cm_ctx);
 	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+		enum fi_log_level level = (ret == -FI_ECONNREFUSED) ?
+				FI_LOG_INFO : FI_LOG_WARN;
+		FI_LOG(&tcpx_prov, level, FI_LOG_EP_CTRL,
 			"Failed to receive connect response\n");
 		return ret;
 	}
@@ -185,7 +208,7 @@ static int proc_conn_resp(struct tcpx_cm_context *cm_ctx,
 	ep->hdr_bswap = (conn_resp.conn_data == 1) ?
 			tcpx_hdr_none : tcpx_hdr_bswap;
 
-	ret = tcpx_ep_enable_xfers(ep);
+	ret = tcpx_ep_enable(ep);
 	if (ret)
 		goto err;
 
@@ -207,7 +230,6 @@ int tcpx_eq_wait_try_func(void *arg)
 static void client_recv_connresp(struct util_wait *wait,
 				 struct tcpx_cm_context *cm_ctx)
 {
-	struct fi_eq_err_entry err_entry;
 	struct tcpx_ep *ep;
 	ssize_t ret;
 
@@ -222,6 +244,7 @@ static void client_recv_connresp(struct util_wait *wait,
 		goto err;
 	}
 
+	/* TODO: merge proc_conn_resp into here */
 	ret = proc_conn_resp(cm_ctx, ep);
 	if (ret)
 		goto err;
@@ -229,42 +252,34 @@ static void client_recv_connresp(struct util_wait *wait,
 	free(cm_ctx);
 	return;
 err:
-	memset(&err_entry, 0, sizeof err_entry);
-	err_entry.fid = cm_ctx->fid;
-	err_entry.context = cm_ctx->fid->context;
-	err_entry.err = -ret;
-	if (cm_ctx->cm_data_sz) {
-		err_entry.err_data = calloc(1, cm_ctx->cm_data_sz);
-		if (err_entry.err_data) {
-			memcpy(err_entry.err_data, cm_ctx->cm_data,
-			       cm_ctx->cm_data_sz);
-			err_entry.err_data_size = cm_ctx->cm_data_sz;
-		}
-	}
+	tcpx_ep_disable(ep, -ret);
 	free(cm_ctx);
-
-	/* `err_entry.err_data` must live until it is passed to user */
-	ret = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN,
-			  &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR);
-	if (ret < 0)
-		free(err_entry.err_data);
 }
 
 static void server_send_cm_accept(struct util_wait *wait,
 				  struct tcpx_cm_context *cm_ctx)
 {
 	struct fi_eq_cm_entry cm_entry = {0};
-	struct fi_eq_err_entry err_entry;
 	struct tcpx_ep *ep;
 	int ret;
 
+	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Send connect (accept) response\n");
 	assert(cm_ctx->fid->fclass == FI_CLASS_EP);
 	ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid);
 
-	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Send connect (accept) response\n");
+	ret = ofi_wait_del_fd(wait, ep->sock);
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"Could not remove fd from wait\n");
+		goto err;
+	}
+
 	ret = tx_cm_data(ep->sock, ofi_ctrl_connresp, cm_ctx);
-	if (ret)
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"Failed to send connect (accept) response\n");
 		goto err;
+	}
 
 	cm_entry.fid =  cm_ctx->fid;
 	ret = (int) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED,
@@ -272,29 +287,17 @@ static void server_send_cm_accept(struct util_wait *wait,
 	if (ret < 0)
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n");
 
-	ret = ofi_wait_del_fd(wait, ep->sock);
-	if (ret) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"Could not remove fd from wait\n");
-		goto err;
-	}
-
-	ret = tcpx_ep_enable_xfers(ep);
+	ret = tcpx_ep_enable(ep);
 	if (ret)
 		goto err;
 
 	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Connection Accept Successful\n");
 	free(cm_ctx);
 	return;
-err:
-	memset(&err_entry, 0, sizeof err_entry);
-	err_entry.fid = cm_ctx->fid;
-	err_entry.context = cm_ctx->fid->context;
-	err_entry.err = -ret;
 
+err:
+	tcpx_ep_disable(ep, -ret);
 	free(cm_ctx);
-	fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN,
-		    &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR);
 }
 
 static void server_recv_connreq(struct util_wait *wait,
@@ -306,10 +309,17 @@ static void server_recv_connreq(struct util_wait *wait,
 	socklen_t len;
 	int ret;
 
-	assert(cm_ctx->fid->fclass == FI_CLASS_CONNREQ);
+	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Server receive connect request\n");
 	handle  = container_of(cm_ctx->fid, struct tcpx_conn_handle, handle);
 
-	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Server receive connect request\n");
+	ret = ofi_wait_del_fd(wait, handle->sock);
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"fd deletion from ofi_wait failed\n");
+		cm_ctx->type = CLIENT_SERVER_ERROR;
+		return;
+	}
+
 	ret = rx_cm_data(handle->sock, &conn_req, ofi_ctrl_connreq, cm_ctx);
 	if (ret)
 		goto err1;
@@ -342,10 +352,7 @@ static void server_recv_connreq(struct util_wait *wait,
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n");
 		goto err3;
 	}
-	ret = ofi_wait_del_fd(wait, handle->sock);
-	if (ret)
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"fd deletion from ofi_wait failed\n");
+
 	free(cm_entry);
 	free(cm_ctx);
 	return;
@@ -354,7 +361,6 @@ err3:
 err2:
 	free(cm_entry);
 err1:
-	ofi_wait_del_fd(wait, handle->sock);
 	ofi_close_socket(handle->sock);
 	free(cm_ctx);
 	free(handle);
@@ -364,20 +370,25 @@ static void client_send_connreq(struct util_wait *wait,
 				struct tcpx_cm_context *cm_ctx)
 {
 	struct tcpx_ep *ep;
-	struct fi_eq_err_entry err_entry;
 	socklen_t len;
 	int status, ret = FI_SUCCESS;
 
 	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "client send connreq\n");
-	assert(cm_ctx->fid->fclass == FI_CLASS_EP);
-
 	ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid);
 
+	ret = ofi_wait_del_fd(wait, ep->sock);
+	if (ret) {
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
+			"Could not remove fd from wait: %s\n",
+			fi_strerror(-ret));
+		goto err;
+	}
+
 	len = sizeof(status);
 	ret = getsockopt(ep->sock, SOL_SOCKET, SO_ERROR, (char *) &status, &len);
 	if (ret < 0 || status) {
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "connection failure\n");
 		ret = (ret < 0)? -ofi_sockerr() : status;
+		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "connection failure\n");
 		goto err;
 	}
 
@@ -386,8 +397,6 @@ static void client_send_connreq(struct util_wait *wait,
 		goto err;
 
 	ret = ofi_wait_del_fd(wait, ep->sock);
-	if (ret)
-		goto err;
 
 	cm_ctx->type = CLIENT_RECV_CONNRESP;
 	ret = ofi_wait_add_fd(wait, ep->sock, POLLIN,
@@ -396,15 +405,10 @@ static void client_send_connreq(struct util_wait *wait,
 		goto err;
 
 	return;
-err:
-	memset(&err_entry, 0, sizeof err_entry);
-	err_entry.fid = cm_ctx->fid;
-	err_entry.context = cm_ctx->fid->context;
-	err_entry.err = -ret;
 
+err:
+	tcpx_ep_disable(ep, -ret);
 	free(cm_ctx);
-	fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN,
-		    &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR);
 }
 
 static void server_sock_accept(struct util_wait *wait,
@@ -459,28 +463,46 @@ err1:
 	ofi_close_socket(sock);
 }
 
+/*
+ * For endpoint events, cm_context::fid refers to an endpoint, which
+ * carries the EP state.  That state duplicates cm_context::type.
+ * TODO: remove cm_ctx and rely on the fid fclass and EP state instead.
+ */
 static void process_cm_ctx(struct util_wait *wait,
 			   struct tcpx_cm_context *cm_ctx)
 {
 	switch (cm_ctx->type) {
 	case SERVER_SOCK_ACCEPT:
-		server_sock_accept(wait,cm_ctx);
+		assert(cm_ctx->fid->fclass == FI_CLASS_PEP);
+		server_sock_accept(wait, cm_ctx);
 		break;
 	case CLIENT_SEND_CONNREQ:
+		assert((cm_ctx->fid->fclass == FI_CLASS_EP) &&
+		       (container_of(cm_ctx->fid, struct tcpx_ep,
+				     util_ep.ep_fid.fid)->state ==
+							  TCPX_CONNECTING));
 		client_send_connreq(wait, cm_ctx);
 		break;
 	case SERVER_RECV_CONNREQ:
+		assert(cm_ctx->fid->fclass == FI_CLASS_CONNREQ);
 		server_recv_connreq(wait, cm_ctx);
 		break;
 	case SERVER_SEND_CM_ACCEPT:
+		assert((cm_ctx->fid->fclass == FI_CLASS_EP) &&
+		       (container_of(cm_ctx->fid, struct tcpx_ep,
+				     util_ep.ep_fid.fid)->state ==
+							  TCPX_ACCEPTING));
 		server_send_cm_accept(wait, cm_ctx);
 		break;
 	case CLIENT_RECV_CONNRESP:
+		assert((cm_ctx->fid->fclass == FI_CLASS_EP) &&
+		       (container_of(cm_ctx->fid, struct tcpx_ep,
+				     util_ep.ep_fid.fid)->state ==
+							  TCPX_CONNECTING));
 		client_recv_connresp(wait, cm_ctx);
 		break;
 	default:
-		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
-			"should never end up here\n");
+		break;
 	}
 }
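
The asserts added above tie each cm_ctx->type to an endpoint state introduced by this patch. A minimal sketch of the progression those checks imply; the state names all appear in this diff, but the ordering and the enum's real definition (in the tcpx header, not shown here) are assumptions:

	enum tcpx_state {
		TCPX_IDLE,		/* fresh endpoint; fi_connect() permitted      */
		TCPX_CONNECTING,	/* client: connect() issued, awaiting response */
		TCPX_RCVD_REQ,		/* server: connreq passed up; fi_accept() ok   */
		TCPX_ACCEPTING,		/* server: fi_accept() issued                  */
		TCPX_CONNECTED,		/* data transfers enabled                      */
		TCPX_DISCONNECTED,	/* tcpx_ep_disable() has run                   */
	};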
 
diff --git a/deps/libfabric/prov/tcp/src/tcpx_cq.c b/deps/libfabric/prov/tcp/src/tcpx_cq.c
index a926b03047175e6dfac8aae53da1296dc6f0d7e9..c4ce02291b9f08c1a7e4e3be2345cf3b3bacae16 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_cq.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_cq.c
@@ -58,7 +58,7 @@ void tcpx_cq_progress(struct util_cq *cq)
 		tcpx_try_func(&ep->util_ep);
 		fastlock_acquire(&ep->lock);
 		tcpx_progress_tx(ep);
-		if (ep->stage_buf.off != ep->stage_buf.len)
+		if (ep->stage_buf.cur_pos < ep->stage_buf.bytes_avail)
 			tcpx_progress_rx(ep);
 		fastlock_release(&ep->lock);
 	}
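
The staging-buffer cursor fields change from len/off to bytes_avail/cur_pos (the old size/len/off fields are removed in the tcpx_ep.c hunk below). A sketch of the layout this implies, assuming the struct lives in tcpx.h; the field order is a guess:

	struct stage_buf {
		uint8_t	buf[STAGE_BUF_SIZE];	/* bytes read ahead from the socket */
		size_t	bytes_avail;		/* valid bytes currently staged     */
		size_t	cur_pos;		/* next unconsumed byte             */
	};

Unconsumed staged data exists exactly when cur_pos < bytes_avail, which is the receive-progress trigger used above.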
diff --git a/deps/libfabric/prov/tcp/src/tcpx_ep.c b/deps/libfabric/prov/tcp/src/tcpx_ep.c
index 2f8755d94da558858e5777153704019666318d54..1076993dbb6737b445884667105163330599b82a 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_ep.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_ep.c
@@ -77,14 +77,14 @@ static int tcpx_setup_socket(SOCKET sock)
 			 sizeof(optval));
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"setsockopt reuseaddr failed\n");
-		return ret;
+		return -ofi_sockerr();
 	}
 
 	ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &optval,
 			 sizeof(optval));
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"setsockopt nodelay failed\n");
-		return ret;
+		return -ofi_sockerr();
 	}
 
 	return ret;
@@ -97,7 +97,8 @@ static int tcpx_ep_connect(struct fid_ep *ep, const void *addr,
 	struct tcpx_cm_context *cm_ctx;
 	int ret;
 
-	if (!addr || !tcpx_ep->sock || paramlen > TCPX_MAX_CM_DATA_SIZE)
+	if (!addr || !tcpx_ep->sock || paramlen > TCPX_MAX_CM_DATA_SIZE ||
+	    tcpx_ep->state != TCPX_IDLE)
 		return -FI_EINVAL;
 
 	cm_ctx = calloc(1, sizeof(*cm_ctx));
@@ -107,11 +108,13 @@ static int tcpx_ep_connect(struct fid_ep *ep, const void *addr,
 		return -FI_ENOMEM;
 	}
 
+	tcpx_ep->state = TCPX_CONNECTING;
 	ret = connect(tcpx_ep->sock, (struct sockaddr *) addr,
 		      (socklen_t) ofi_sizeofaddr(addr));
 	if (ret && ofi_sockerr() != FI_EINPROGRESS) {
+		tcpx_ep->state = TCPX_IDLE;
 		ret =  -ofi_sockerr();
-		goto err;
+		goto free;
 	}
 
 	cm_ctx->fid = &tcpx_ep->util_ep.ep_fid.fid;
@@ -125,10 +128,13 @@ static int tcpx_ep_connect(struct fid_ep *ep, const void *addr,
 	ret = ofi_wait_add_fd(tcpx_ep->util_ep.eq->wait, tcpx_ep->sock,
 			      POLLOUT, tcpx_eq_wait_try_func, NULL,cm_ctx);
 	if (ret)
-		goto err;
+		goto disable;
 
 	return 0;
-err:
+
+disable:
+	tcpx_ep_disable(tcpx_ep, -ret);
+free:
 	free(cm_ctx);
 	return ret;
 }
@@ -139,7 +145,7 @@ static int tcpx_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
 	struct tcpx_cm_context *cm_ctx;
 	int ret;
 
-	if (tcpx_ep->sock == INVALID_SOCKET)
+	if (tcpx_ep->sock == INVALID_SOCKET || tcpx_ep->state != TCPX_RCVD_REQ)
 		return -FI_EINVAL;
 
 	cm_ctx = calloc(1, sizeof(*cm_ctx));
@@ -149,6 +155,7 @@ static int tcpx_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
 		return -FI_ENOMEM;
 	}
 
+	tcpx_ep->state = TCPX_ACCEPTING;
 	cm_ctx->fid = &tcpx_ep->util_ep.ep_fid.fid;
 	cm_ctx->type = SERVER_SEND_CM_ACCEPT;
 	if (paramlen) {
@@ -159,11 +166,83 @@ static int tcpx_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
 	ret = ofi_wait_add_fd(tcpx_ep->util_ep.eq->wait, tcpx_ep->sock,
 			      POLLOUT, tcpx_eq_wait_try_func, NULL, cm_ctx);
 	if (ret)
-		free(cm_ctx);
+		goto free;
+
+	return 0;
 
+free:
+	tcpx_ep->state = TCPX_RCVD_REQ;
+	free(cm_ctx);
 	return ret;
 }
 
+static void tcpx_ep_flush_pending_xfers(struct tcpx_ep *ep)
+{
+	struct slist_entry *entry;
+	struct tcpx_xfer_entry *tx_entry;
+	struct tcpx_cq *cq;
+
+	while (!slist_empty(&ep->tx_rsp_pend_queue)) {
+		entry = slist_remove_head(&ep->tx_rsp_pend_queue);
+		tx_entry = container_of(entry, struct tcpx_xfer_entry, entry);
+		tcpx_cq_report_error(ep->util_ep.tx_cq, tx_entry, FI_ENOTCONN);
+
+		cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq);
+		tcpx_xfer_entry_release(cq, tx_entry);
+	}
+}
+
+/* must hold ep->lock */
+void tcpx_ep_disable(struct tcpx_ep *ep, int cm_err)
+{
+	struct util_wait_fd *wait;
+	struct fi_eq_cm_entry cm_entry = {0};
+	struct fi_eq_err_entry err_entry = {0};
+
+	switch (ep->state) {
+	case TCPX_RCVD_REQ:
+		break;
+	case TCPX_CONNECTED:
+		if (ep->util_ep.tx_cq) {
+			wait = container_of(ep->util_ep.tx_cq->wait,
+					    struct util_wait_fd, util_wait);
+			ofi_wait_fdset_del(wait, ep->sock);
+		}
+
+		if (ep->util_ep.rx_cq) {
+			wait = container_of(ep->util_ep.rx_cq->wait,
+					    struct util_wait_fd, util_wait);
+			ofi_wait_fdset_del(wait, ep->sock);
+		}
+
+		tcpx_ep_flush_pending_xfers(ep);
+		/* fall through */
+	case TCPX_ACCEPTING:
+	case TCPX_CONNECTING:
+		wait = container_of(ep->util_ep.eq->wait,
+				    struct util_wait_fd, util_wait);
+		ofi_wait_fdset_del(wait, ep->sock);
+		break;
+
+	default:
+		return;
+	}
+
+	if (cm_err) {
+		err_entry.fid = &ep->util_ep.ep_fid.fid;
+		err_entry.context = ep->util_ep.ep_fid.fid.context;
+		err_entry.err = cm_err;
+		(void) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN,
+				   &err_entry, sizeof(err_entry),
+				   UTIL_FLAG_ERROR);
+	} else {
+		cm_entry.fid = &ep->util_ep.ep_fid.fid;
+		(void) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN,
+				   &cm_entry, sizeof(cm_entry), 0);
+	}
+	ep->state = TCPX_DISCONNECTED;
+}
+
 static int tcpx_ep_shutdown(struct fid_ep *ep, uint64_t flags)
 {
 	struct tcpx_ep *tcpx_ep;
@@ -177,20 +256,21 @@ static int tcpx_ep_shutdown(struct fid_ep *ep, uint64_t flags)
 	}
 
 	fastlock_acquire(&tcpx_ep->lock);
-	ret = tcpx_ep_shutdown_report(tcpx_ep, &ep->fid);
+	tcpx_ep_disable(tcpx_ep, 0);
 	fastlock_release(&tcpx_ep->lock);
-	if (ret)
-		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, "Error writing to EQ\n");
 
-	return ret;
+	return FI_SUCCESS;
 }
 
 static int tcpx_bind_to_port_range(SOCKET sock, void* src_addr, size_t addrlen)
 {
 	int ret, i, rand_port_number;
+	static uint32_t seed;
+	if (!seed)
+		seed = ofi_generate_seed();
 
-	rand_port_number = rand() % (port_range.high + 1 - port_range.low) +
-			   port_range.low;
+	rand_port_number = ofi_xorshift_random_r(&seed) %
+			   (port_range.high + 1 - port_range.low) + port_range.low;
 
 	for (i = port_range.low; i <= port_range.high; i++, rand_port_number++) {
 		if (rand_port_number > port_range.high)
@@ -199,13 +279,13 @@ static int tcpx_bind_to_port_range(SOCKET sock, void* src_addr, size_t addrlen)
 		ofi_addr_set_port(src_addr, rand_port_number);
 		ret = bind(sock, src_addr, (socklen_t) addrlen);
 		if (ret) {
-			if (errno == EADDRINUSE)
+			if (ofi_sockerr() == EADDRINUSE)
 				continue;
 
 			FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 				"failed to bind listener: %s\n",
 				strerror(ofi_sockerr()));
-			return -errno;
+			return -ofi_sockerr();
 		}
 		break;
 	}
@@ -239,12 +319,15 @@ static int tcpx_pep_sock_create(struct tcpx_pep *pep)
 	if (ret) {
 		goto err;
 	}
-	if (ofi_addr_get_port(pep->info->src_addr) != 0 || port_range.high == 0)
+	if (ofi_addr_get_port(pep->info->src_addr) != 0 || port_range.high == 0) {
 		ret = bind(pep->sock, pep->info->src_addr,
 			  (socklen_t) pep->info->src_addrlen);
-	else
+		if (ret)
+			ret = -ofi_sockerr();
+	} else {
 		ret = tcpx_bind_to_port_range(pep->sock, pep->info->src_addr,
 					      pep->info->src_addrlen);
+	}
 
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
@@ -344,42 +427,37 @@ static void tcpx_ep_tx_rx_queues_release(struct tcpx_ep *ep)
 	fastlock_release(&ep->lock);
 }
 
-/**
- * Release the ep from polling
- */
-void tcpx_ep_wait_fd_del(struct tcpx_ep *ep)
+static int tcpx_ep_close(struct fid *fid)
 {
-	FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "releasing ep=%p\n", ep);
-
+	struct tcpx_ep *ep;
 	struct tcpx_eq *eq;
 
-	eq = container_of(ep->util_ep.eq, struct tcpx_eq,
-			  util_eq);
+	ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid);
+	eq = ep->util_ep.eq ?
+	     container_of(ep->util_ep.eq, struct tcpx_eq, util_eq) : NULL;
 
 	/* eq->close_lock protects from processing stale connection events */
-	fastlock_acquire(&eq->close_lock);
+	if (eq)
+		fastlock_acquire(&eq->close_lock);
+
 	if (ep->util_ep.rx_cq)
 		ofi_wait_del_fd(ep->util_ep.rx_cq->wait, ep->sock);
 
 	if (ep->util_ep.tx_cq)
 		ofi_wait_del_fd(ep->util_ep.tx_cq->wait, ep->sock);
 
-	if (ep->util_ep.eq->wait)
+	if (ep->util_ep.eq && ep->util_ep.eq->wait)
 		ofi_wait_del_fd(ep->util_ep.eq->wait, ep->sock);
 
-	fastlock_release(&eq->close_lock);
-}
-
-static int tcpx_ep_close(struct fid *fid)
-{
-	struct tcpx_ep *ep = container_of(fid, struct tcpx_ep,
-					  util_ep.ep_fid.fid);
+	if (eq)
+		fastlock_release(&eq->close_lock);
 
 	tcpx_ep_tx_rx_queues_release(ep);
 
-	tcpx_ep_wait_fd_del(ep); /* ensure that everything is really released */
-
-	ofi_eq_remove_fid_events(ep->util_ep.eq, &ep->util_ep.ep_fid.fid);
+	if (eq) {
+		ofi_eq_remove_fid_events(ep->util_ep.eq,
+					 &ep->util_ep.ep_fid.fid);
+	}
 	ofi_close_socket(ep->sock);
 	ofi_endpoint_close(&ep->util_ep);
 	fastlock_destroy(&ep->lock);
@@ -517,6 +595,7 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 			ep->sock = pep->sock;
 			pep->sock = INVALID_SOCKET;
 		} else {
+			ep->state = TCPX_RCVD_REQ;
 			handle = container_of(info->handle,
 					      struct tcpx_conn_handle, handle);
 			ep->sock = handle->sock;
@@ -540,15 +619,10 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 			goto err3;
 	}
 
-	ep->cm_state = TCPX_EP_CONNECTING;
 	ret = fastlock_init(&ep->lock);
 	if (ret)
 		goto err3;
 
-	ep->stage_buf.size = STAGE_BUF_SIZE;
-	ep->stage_buf.len = 0;
-	ep->stage_buf.off = 0;
-
 	slist_init(&ep->rx_queue);
 	slist_init(&ep->tx_queue);
 	slist_init(&ep->rma_read_queue);
@@ -565,11 +639,11 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info,
 	(*ep_fid)->msg = &tcpx_msg_ops;
 	(*ep_fid)->rma = &tcpx_rma_ops;
 
-	ep->get_rx_entry[ofi_op_msg] = tcpx_get_rx_entry_op_msg;
-	ep->get_rx_entry[ofi_op_tagged] = tcpx_get_rx_entry_op_invalid;
-	ep->get_rx_entry[ofi_op_read_req] = tcpx_get_rx_entry_op_read_req;
-	ep->get_rx_entry[ofi_op_read_rsp] = tcpx_get_rx_entry_op_read_rsp;
-	ep->get_rx_entry[ofi_op_write] = tcpx_get_rx_entry_op_write;
+	ep->start_op[ofi_op_msg] = tcpx_op_msg;
+	ep->start_op[ofi_op_tagged] = tcpx_op_invalid;
+	ep->start_op[ofi_op_read_req] = tcpx_op_read_req;
+	ep->start_op[ofi_op_read_rsp] = tcpx_op_read_rsp;
+	ep->start_op[ofi_op_write] = tcpx_op_write;
 	return 0;
 err3:
 	ofi_close_socket(ep->sock);
@@ -670,7 +744,8 @@ static int tcpx_pep_listen(struct fid_pep *pep)
 
 	tcpx_pep = container_of(pep,struct tcpx_pep, util_pep.pep_fid);
 
-	if (listen(tcpx_pep->sock, SOMAXCONN)) {
+	/* Use a backlog larger than SOMAXCONN to support larger scale jobs. */
+	if (listen(tcpx_pep->sock, 4096)) {
 		FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,
 			"socket listen failed\n");
 		return -ofi_sockerr();
@@ -782,8 +857,10 @@ int tcpx_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 	_pep->util_pep.pep_fid.ops = &tcpx_pep_ops;
 
 	_pep->info = fi_dupinfo(info);
-	if (!_pep->info)
+	if (!_pep->info) {
+		ret = -FI_ENOMEM;
 		goto err2;
+	}
 
 	_pep->cm_ctx.fid = &_pep->util_pep.pep_fid.fid;
 	_pep->cm_ctx.type = SERVER_SOCK_ACCEPT;
diff --git a/deps/libfabric/prov/tcp/src/tcpx_init.c b/deps/libfabric/prov/tcp/src/tcpx_init.c
index 559fa8b162cceba89c553cdd75c2e5cfd9f39f92..7b5ed3b5849783a138c0519f64447906e621cda2 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_init.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_init.c
@@ -54,8 +54,6 @@ struct tcpx_port_range port_range = {
 
 static void tcpx_init_env(void)
 {
-	srand(getpid());
-
 	fi_param_get_int(&tcpx_prov, "port_high_range", &port_range.high);
 	fi_param_get_int(&tcpx_prov, "port_low_range", &port_range.low);
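
The srand(getpid()) call is gone because port selection (tcpx_bind_to_port_range, above) now keeps its own lazily seeded state and draws from ofi_xorshift_random_r. Assuming that helper follows Marsaglia's classic 32-bit xorshift, a plausible sketch:

	static inline uint32_t ofi_xorshift_random_r(uint32_t *seed)
	{
		*seed ^= *seed << 13;
		*seed ^= *seed >> 17;
		*seed ^= *seed << 5;
		return *seed;
	}

Keeping the generator private to the provider also stops libfabric from perturbing the application's rand() sequence, which the old global srand() did.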
 
diff --git a/deps/libfabric/prov/tcp/src/tcpx_progress.c b/deps/libfabric/prov/tcp/src/tcpx_progress.c
index a7dcb8c3e8117f5e0b5892374f0a8b41340b7a79..8d955fb1a8badb57bf054c0f6a6e59f010914ad2 100644
--- a/deps/libfabric/prov/tcp/src/tcpx_progress.c
+++ b/deps/libfabric/prov/tcp/src/tcpx_progress.c
@@ -42,55 +42,6 @@
 #include <ofi_util.h>
 #include <ofi_iov.h>
 
-static void tcpx_cq_report_xfer_fail(struct tcpx_ep *tcpx_ep, int err)
-{
-	struct slist_entry *entry;
-	struct tcpx_xfer_entry *tx_entry;
-	struct tcpx_cq *tcpx_cq;
-
-	while (!slist_empty(&tcpx_ep->tx_rsp_pend_queue)) {
-		entry = slist_remove_head(&tcpx_ep->tx_rsp_pend_queue);
-		tx_entry = container_of(entry, struct tcpx_xfer_entry, entry);
-		tcpx_cq_report_error(tx_entry->ep->util_ep.tx_cq, tx_entry, -err);
-
-		tcpx_cq = container_of(tx_entry->ep->util_ep.tx_cq,
-				       struct tcpx_cq, util_cq);
-		tcpx_xfer_entry_release(tcpx_cq, tx_entry);
-	}
-}
-
-/**
- * Shutdown is done in two phases, phase1 writes the FI_SHUTDOWN event, which
- * a polling thread still needs to handle, phase2 removes the fd
- * of the ep from polling, so that a polling thread won't spin
- * if it does not close the connection immediately after it handled
- * FI_SHUTDOWN
- */
-int tcpx_ep_shutdown_report(struct tcpx_ep *ep, fid_t fid)
-{
-	struct fi_eq_cm_entry cm_entry = {0};
-	ssize_t len;
-
-	switch (ep->cm_state) {
-	case TCPX_EP_POLL_REMOVED:
-		break;
-	case TCPX_EP_SHUTDOWN:
-		tcpx_ep_wait_fd_del(ep);
-		ep->cm_state = TCPX_EP_POLL_REMOVED;
-		break;
-	default:
-		tcpx_cq_report_xfer_fail(ep, -FI_ENOTCONN);
-		ep->cm_state = TCPX_EP_SHUTDOWN;
-		cm_entry.fid = fid;
-		len =  fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN,
-				   &cm_entry, sizeof(cm_entry), 0);
-		if (len < 0)
-			return (int) len;
-		break;
-	}
-
-	return FI_SUCCESS;
-}
 
 static void process_tx_entry(struct tcpx_xfer_entry *tx_entry)
 {
@@ -107,8 +58,7 @@ static void process_tx_entry(struct tcpx_xfer_entry *tx_entry)
 
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "msg send failed\n");
-		tcpx_ep_shutdown_report(tx_entry->ep,
-					&tx_entry->ep->util_ep.ep_fid.fid);
+		tcpx_ep_disable(tx_entry->ep, 0);
 		tcpx_cq_report_error(tx_entry->ep->util_ep.tx_cq,
 				     tx_entry, -ret);
 	} else {
@@ -172,8 +122,7 @@ static int process_rx_entry(struct tcpx_xfer_entry *rx_entry)
 		FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
 			"msg recv Failed ret = %d\n", ret);
 
-		tcpx_ep_shutdown_report(rx_entry->ep,
-					&rx_entry->ep->util_ep.ep_fid.fid);
+		tcpx_ep_disable(rx_entry->ep, 0);
 		tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
 		tcpx_rx_msg_release(rx_entry);
 	} else if (rx_entry->hdr.base_hdr.flags & OFI_DELIVERY_COMPLETE) {
@@ -245,7 +194,7 @@ static void tcpx_pmem_commit(struct tcpx_xfer_entry *rx_entry)
 	}
 }
 
-static int process_rx_remote_write_entry(struct tcpx_xfer_entry *rx_entry)
+static int process_remote_write(struct tcpx_xfer_entry *rx_entry)
 {
 	struct tcpx_cq *tcpx_cq;
 	int ret = FI_SUCCESS;
@@ -259,8 +208,7 @@ static int process_rx_remote_write_entry(struct tcpx_xfer_entry *rx_entry)
 			"remote write Failed ret = %d\n",
 			ret);
 
-		tcpx_ep_shutdown_report(rx_entry->ep,
-					&rx_entry->ep->util_ep.ep_fid.fid);
+		tcpx_ep_disable(rx_entry->ep, 0);
 		tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret);
 		tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq,
 				       struct tcpx_cq, util_cq);
@@ -283,7 +231,7 @@ static int process_rx_remote_write_entry(struct tcpx_xfer_entry *rx_entry)
 	return ret;
 }
 
-static int process_rx_read_entry(struct tcpx_xfer_entry *rx_entry)
+static int process_remote_read(struct tcpx_xfer_entry *rx_entry)
 {
 	struct tcpx_cq *tcpx_cq;
 	int ret = FI_SUCCESS;
@@ -295,8 +243,7 @@ static int process_rx_read_entry(struct tcpx_xfer_entry *rx_entry)
 	if (ret) {
 		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
 			"msg recv Failed ret = %d\n", ret);
-		tcpx_ep_shutdown_report(rx_entry->ep,
-					&rx_entry->ep->util_ep.ep_fid.fid);
+		tcpx_ep_disable(rx_entry->ep, 0);
 		tcpx_cq_report_error(rx_entry->ep->util_ep.tx_cq, rx_entry, -ret);
 	} else {
 		tcpx_cq_report_success(rx_entry->ep->util_ep.tx_cq, rx_entry);
@@ -391,20 +338,23 @@ static int tcpx_validate_rx_rma_data(struct tcpx_xfer_entry *rx_entry,
 	return FI_SUCCESS;
 }
 
-int tcpx_get_rx_entry_op_invalid(struct tcpx_ep *tcpx_ep)
+int tcpx_op_invalid(struct tcpx_ep *tcpx_ep)
 {
 	return -FI_EINVAL;
 }
 
-static inline void
-tcpx_rx_detect_init(struct tcpx_cur_rx_msg *cur_rx_msg)
-
+static void tcpx_rx_setup(struct tcpx_ep *ep, struct tcpx_xfer_entry *rx_entry,
+			  tcpx_rx_process_fn_t process_fn)
 {
-	cur_rx_msg->hdr_len = sizeof(cur_rx_msg->hdr.base_hdr);
-	cur_rx_msg->done_len = 0;
+	ep->cur_rx_entry = rx_entry;
+	ep->cur_rx_proc_fn = process_fn;
+
+	/* Reset to receive next message */
+	ep->cur_rx_msg.hdr_len = sizeof(ep->cur_rx_msg.hdr.base_hdr);
+	ep->cur_rx_msg.done_len = 0;
 }
 
-int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep)
+int tcpx_op_msg(struct tcpx_ep *tcpx_ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
 	struct tcpx_xfer_entry *tx_entry;
@@ -424,7 +374,7 @@ int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep)
 
 		slist_remove_head(&tx_entry->ep->tx_rsp_pend_queue);
 		tcpx_xfer_entry_release(tcpx_cq, tx_entry);
-		tcpx_rx_detect_init(cur_rx_msg);
+		tcpx_rx_setup(tcpx_ep, NULL, NULL);
 		return -FI_EAGAIN;
 	}
 
@@ -466,16 +416,14 @@ int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep)
 		return ret;
 	}
 
-	tcpx_ep->cur_rx_proc_fn = process_rx_entry;
 	if (cur_rx_msg->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA)
 		rx_entry->flags |= FI_REMOTE_CQ_DATA;
 
-	tcpx_rx_detect_init(cur_rx_msg);
-	tcpx_ep->cur_rx_entry = rx_entry;
+	tcpx_rx_setup(tcpx_ep, rx_entry, process_rx_entry);
 	return FI_SUCCESS;
 }
 
-int tcpx_get_rx_entry_op_read_req(struct tcpx_ep *tcpx_ep)
+int tcpx_op_read_req(struct tcpx_ep *tcpx_ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
 	struct tcpx_cq *tcpx_cq;
@@ -507,13 +455,11 @@ int tcpx_get_rx_entry_op_read_req(struct tcpx_ep *tcpx_ep)
 		return ret;
 	}
 
-	tcpx_rx_detect_init(&tcpx_ep->cur_rx_msg);
-	tcpx_ep->cur_rx_entry = rx_entry;
-	tcpx_ep->cur_rx_proc_fn = tcpx_prepare_rx_remote_read_resp;
+	tcpx_rx_setup(tcpx_ep, rx_entry, tcpx_prepare_rx_remote_read_resp);
 	return FI_SUCCESS;
 }
 
-int tcpx_get_rx_entry_op_write(struct tcpx_ep *tcpx_ep)
+int tcpx_op_write(struct tcpx_ep *tcpx_ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
 	struct tcpx_cq *tcpx_cq;
@@ -547,14 +493,12 @@ int tcpx_get_rx_entry_op_write(struct tcpx_ep *tcpx_ep)
 	}
 
 	tcpx_copy_rma_iov_to_msg_iov(rx_entry);
-	tcpx_rx_detect_init(&tcpx_ep->cur_rx_msg);
-	tcpx_ep->cur_rx_entry = rx_entry;
-	tcpx_ep->cur_rx_proc_fn = process_rx_remote_write_entry;
+	tcpx_rx_setup(tcpx_ep, rx_entry, process_remote_write);
 	return FI_SUCCESS;
 
 }
 
-int tcpx_get_rx_entry_op_read_rsp(struct tcpx_ep *tcpx_ep)
+int tcpx_op_read_rsp(struct tcpx_ep *tcpx_ep)
 {
 	struct tcpx_xfer_entry *rx_entry;
 	struct slist_entry *entry;
@@ -572,23 +516,40 @@ int tcpx_get_rx_entry_op_read_rsp(struct tcpx_ep *tcpx_ep)
 	rx_entry->rem_len = (rx_entry->hdr.base_hdr.size -
 			     tcpx_ep->cur_rx_msg.done_len);
 
-	tcpx_rx_detect_init(&tcpx_ep->cur_rx_msg);
-	tcpx_ep->cur_rx_entry = rx_entry;
-	tcpx_ep->cur_rx_proc_fn = process_rx_read_entry;
+	tcpx_rx_setup(tcpx_ep, rx_entry, process_remote_read);
 	return FI_SUCCESS;
 }
 
-static inline int tcpx_get_next_rx_hdr(struct tcpx_ep *ep)
+static int tcpx_get_next_rx_hdr(struct tcpx_ep *ep)
 {
-	int ret;
+	ssize_t ret;
 
-	/* hdr already read from socket in previous call */
-	if (ep->cur_rx_msg.hdr_len == ep->cur_rx_msg.done_len)
-		return FI_SUCCESS;
+	ret = tcpx_recv_hdr(ep->sock, &ep->stage_buf, &ep->cur_rx_msg);
+	if (ret < 0)
+		return (int) ret;
 
-	ret = tcpx_comm_recv_hdr(ep->sock, &ep->stage_buf, &ep->cur_rx_msg);
-	if (ret)
-		return ret;
+	ep->cur_rx_msg.done_len += ret;
+	if (ep->cur_rx_msg.done_len >= sizeof(ep->cur_rx_msg.hdr.base_hdr)) {
+		if (ep->cur_rx_msg.hdr.base_hdr.payload_off > TCPX_MAX_HDR_SZ) {
+			FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+				"Payload offset is too large\n");
+			return -FI_EIO;
+		}
+		ep->cur_rx_msg.hdr_len =
+			(size_t) ep->cur_rx_msg.hdr.base_hdr.payload_off;
+
+		if (ep->cur_rx_msg.hdr_len > ep->cur_rx_msg.done_len) {
+			ret = tcpx_recv_hdr(ep->sock, &ep->stage_buf,
+					    &ep->cur_rx_msg);
+			if (ret < 0)
+				return (int) ret;
+
+			ep->cur_rx_msg.done_len += ret;
+		}
+	}
+
+	if (ep->cur_rx_msg.done_len < ep->cur_rx_msg.hdr_len)
+		return -FI_EAGAIN;
 
 	ep->hdr_bswap(&ep->cur_rx_msg.hdr.base_hdr);
 	return FI_SUCCESS;
@@ -599,7 +560,8 @@ void tcpx_progress_rx(struct tcpx_ep *ep)
 {
 	int ret;
 
-	if (!ep->cur_rx_entry && (ep->stage_buf.len == ep->stage_buf.off)) {
+	if (!ep->cur_rx_entry &&
+	    (ep->stage_buf.cur_pos == ep->stage_buf.bytes_avail)) {
 		ret = tcpx_read_to_buffer(ep->sock, &ep->stage_buf);
 		if (ret)
 			goto err;
@@ -607,29 +569,35 @@ void tcpx_progress_rx(struct tcpx_ep *ep)
 
 	do {
 		if (!ep->cur_rx_entry) {
-			ret = tcpx_get_next_rx_hdr(ep);
-			if (ret)
+			if (ep->cur_rx_msg.done_len < ep->cur_rx_msg.hdr_len) {
+				ret = tcpx_get_next_rx_hdr(ep);
+				if (ret)
+					goto err;
+			}
+
+			if (ep->cur_rx_msg.hdr.base_hdr.op >=
+			    ARRAY_SIZE(ep->start_op)) {
+				FI_WARN(&tcpx_prov, FI_LOG_EP_DATA,
+					"Received invalid opcode\n");
+				ret = -FI_ENOTCONN; /* force shutdown */
 				goto err;
-
-			ret = ep->get_rx_entry[ep->cur_rx_msg.hdr.base_hdr.op](ep);
+			}
+			ret = ep->start_op[ep->cur_rx_msg.hdr.base_hdr.op](ep);
 			if (ret)
 				goto err;
 		}
-		assert(ep->cur_rx_proc_fn != NULL);
+		assert(ep->cur_rx_proc_fn);
 		ep->cur_rx_proc_fn(ep->cur_rx_entry);
 
-	} while (ep->stage_buf.len != ep->stage_buf.off);
+	} while (ep->stage_buf.cur_pos < ep->stage_buf.bytes_avail);
 
 	return;
 err:
 	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
 		return;
 
-	/* Failed current RX entry should clean itself */
-	assert(!ep->cur_rx_entry);
-
 	if (ret == -FI_ENOTCONN)
-		tcpx_ep_shutdown_report(ep, &ep->util_ep.ep_fid.fid);
+		tcpx_ep_disable(ep, 0);
 }
 
 /* Must hold ep lock */
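
The new opcode check rejects wire values before they index the start_op table. The same defensive pattern, reduced to a standalone sketch with hypothetical handlers:

	#include <stddef.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	typedef int (*op_fn)(void);

	static int op_msg(void)  { return 0; }
	static int op_read(void) { return 0; }

	static op_fn start_op[] = { op_msg, op_read };

	/* A value received from the network must never index a function
	 * table unchecked; out-of-range opcodes become protocol errors. */
	static int dispatch(unsigned int op)
	{
		if (op >= ARRAY_SIZE(start_op))
			return -1;	/* force shutdown, as above */
		return start_op[op]();
	}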
diff --git a/deps/libfabric/prov/udp/Makefile.include b/deps/libfabric/prov/udp/Makefile.include
index 6cd0428c5fb9805c7fc84c7d20e33e7aed89e799..c291b6568a136aea8fe88ed7556251a146160996 100644
--- a/deps/libfabric/prov/udp/Makefile.include
+++ b/deps/libfabric/prov/udp/Makefile.include
@@ -11,12 +11,11 @@ _udp_files = \
 if HAVE_UDP_DL
 pkglib_LTLIBRARIES += libudp-fi.la
 libudp_fi_la_SOURCES = $(_udp_files) $(common_srcs)
-libudp_fi_la_LIBADD = $(linkback) $(udp_shm_LIBS)
+libudp_fi_la_LIBADD = $(linkback)
 libudp_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic
 libudp_fi_la_DEPENDENCIES = $(linkback)
 else !HAVE_UDP_DL
 src_libfabric_la_SOURCES += $(_udp_files)
-src_libfabric_la_LIBADD += $(udp_shm_LIBS)
 endif !HAVE_UDP_DL
 
 prov_install_man_pages += man/man7/fi_udp.7
diff --git a/deps/libfabric/prov/udp/configure.m4 b/deps/libfabric/prov/udp/configure.m4
index 8330abb2a1f06dcb2aa2cbdd65d7f1a542287d8e..3538cfc5a16275ebcfa11820715e0735d953e699 100644
--- a/deps/libfabric/prov/udp/configure.m4
+++ b/deps/libfabric/prov/udp/configure.m4
@@ -10,30 +10,10 @@ dnl
 AC_DEFUN([FI_UDP_CONFIGURE],[
 	# Determine if we can support the udp provider
 	udp_h_happy=0
-	udp_shm_happy=0
 	AS_IF([test x"$enable_udp" != x"no"],
 	      [AC_CHECK_HEADER([sys/socket.h], [udp_h_happy=1],
 	                       [udp_h_happy=0])
-
-
-	       # check if shm_open is already present
-	       AC_CHECK_FUNC([shm_open],
-			     [udp_shm_happy=1],
-			     [udp_shm_happy=0])
-
-	       # look for shm_open in librt if not already present
-	       AS_IF([test $udp_shm_happy -eq 0],
-		     [FI_CHECK_PACKAGE([udp_shm],
-				[sys/mman.h],
-				[rt],
-				[shm_open],
-				[],
-				[],
-				[],
-				[udp_shm_happy=1],
-				[udp_shm_happy=0])])
 	      ])
 
-	AS_IF([test $udp_h_happy -eq 1 && \
-	       test $udp_shm_happy -eq 1], [$1], [$2])
+	AS_IF([test $udp_h_happy -eq 1], [$1], [$2])
 ])
diff --git a/deps/libfabric/prov/udp/libfabric-udp.spec.in b/deps/libfabric/prov/udp/libfabric-udp.spec.in
index 9e11096ace38003a1e846ae3dccc00fb036d2fdd..0ad0d5c2b5462ef640e0f8f96f45a987b87bb034 100644
--- a/deps/libfabric/prov/udp/libfabric-udp.spec.in
+++ b/deps/libfabric/prov/udp/libfabric-udp.spec.in
@@ -1,11 +1,12 @@
 %{!?configopts: %global configopts LDFLAGS=-Wl,--build-id}
-%{!?provider: %define provider usnic}
-%{!?provider_formal: %define provider_formal usNIC}
+%{!?provider: %define provider udp}
+%{!?provider_formal: %define provider_formal udp}
 
 Name: libfabric-%{provider}
 Version: @VERSION@
 Release: 1%{?dist}
 Summary: Dynamic %{provider_formal} provider for user-space Open Fabric Interfaces
+
 Group: System Environment/Libraries
 License: GPLv2 or BSD
 Url: http://www.github.com/ofiwg/libfabric
diff --git a/deps/libfabric/prov/util/src/cuda_mem_monitor.c b/deps/libfabric/prov/util/src/cuda_mem_monitor.c
new file mode 100644
index 0000000000000000000000000000000000000000..7abfe6c8aee992a681d6c97fe2978f7706056541
--- /dev/null
+++ b/deps/libfabric/prov/util/src/cuda_mem_monitor.c
@@ -0,0 +1,151 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ofi_mr.h"
+
+#if HAVE_LIBCUDA
+
+#include "ofi_hmem.h"
+
+static int cuda_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			     size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	CUresult ret;
+
+	ret = ofi_cuPointerGetAttribute(&hmem_info->cuda_id,
+					CU_POINTER_ATTRIBUTE_BUFFER_ID,
+					(CUdeviceptr)addr);
+	if (ret == CUDA_SUCCESS) {
+		FI_DBG(&core_prov, FI_LOG_MR,
+		       "Assigned CUDA buffer ID %lu to buffer %p\n",
+		       hmem_info->cuda_id, addr);
+		return FI_SUCCESS;
+	}
+
+	FI_WARN(&core_prov, FI_LOG_MR,
+		"Failed to get CUDA buffer ID for buffer %p len %lu\n"
+		"cuPointerGetAttribute() failed: %s:%s\n", addr, len,
+		ofi_cudaGetErrorName(ret), ofi_cudaGetErrorString(ret));
+
+	return -FI_EFAULT;
+}
+
+static void cuda_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+				const void *addr, size_t len,
+				union ofi_mr_hmem_info *hmem_info)
+{
+	/* no-op */
+}
+
+static bool cuda_mm_valid(struct ofi_mem_monitor *monitor,
+			  const void *addr, size_t len,
+			  union ofi_mr_hmem_info *hmem_info)
+{
+	uint64_t id;
+	CUresult ret;
+
+	/* A CUDA buffer ID is associated with each CUDA monitor entry. If
+	 * the device pages backing the device virtual address change, a
+	 * different buffer ID is associated with the mapping.
+	 */
+	ret = ofi_cuPointerGetAttribute(&id, CU_POINTER_ATTRIBUTE_BUFFER_ID,
+					(CUdeviceptr)addr);
+	if (ret == CUDA_SUCCESS && hmem_info->cuda_id == id) {
+		FI_DBG(&core_prov, FI_LOG_MR,
+		       "CUDA buffer ID %lu still valid for buffer %p\n",
+		       hmem_info->cuda_id, addr);
+		return true;
+	} else if (ret == CUDA_SUCCESS && hmem_info->cuda_id != id) {
+		FI_DBG(&core_prov, FI_LOG_MR,
+		       "CUDA buffer ID %lu invalid for buffer %p\n",
+		       hmem_info->cuda_id, addr);
+	} else {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"Failed to get CUDA buffer ID for buffer %p len %lu\n"
+			"cuPointerGetAttribute() failed: %s:%s\n", addr, len,
+			ofi_cudaGetErrorName(ret), ofi_cudaGetErrorString(ret));
+	}
+
+	return false;
+}
+
+static int cuda_monitor_start(struct ofi_mem_monitor *monitor)
+{
+	/* no-op */
+	return FI_SUCCESS;
+}
+
+#else
+
+static int cuda_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			     size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	return -FI_ENOSYS;
+}
+
+static void cuda_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+				const void *addr, size_t len,
+				union ofi_mr_hmem_info *hmem_info)
+{
+}
+
+static bool cuda_mm_valid(struct ofi_mem_monitor *monitor,
+			  const void *addr, size_t len,
+			  union ofi_mr_hmem_info *hmem_info)
+{
+	return false;
+}
+
+static int cuda_monitor_start(struct ofi_mem_monitor *monitor)
+{
+	return -FI_ENOSYS;
+}
+
+#endif /* HAVE_LIBCUDA */
+
+void cuda_monitor_stop(struct ofi_mem_monitor *monitor)
+{
+	/* no-op */
+}
+
+static struct ofi_mem_monitor cuda_mm = {
+	.iface = FI_HMEM_CUDA,
+	.init = ofi_monitor_init,
+	.cleanup = ofi_monitor_cleanup,
+	.start = cuda_monitor_start,
+	.stop = cuda_monitor_stop,
+	.subscribe = cuda_mm_subscribe,
+	.unsubscribe = cuda_mm_unsubscribe,
+	.valid = cuda_mm_valid,
+};
+
+struct ofi_mem_monitor *cuda_monitor = &cuda_mm;
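
A registration cache would consult these hooks roughly as follows. The caller is hypothetical, but the member names and signatures match the struct ofi_mem_monitor usage above:

	/* Hypothetical cache-side check built on the monitor hooks above. */
	static bool cache_entry_usable(struct ofi_mem_monitor *mon,
				       const void *addr, size_t len,
				       union ofi_mr_hmem_info *info)
	{
		/* A changed buffer ID means the device pages behind this
		 * virtual address were remapped, so the cached registration
		 * is stale and must be dropped and redone. */
		if (!mon->valid(mon, addr, len, info)) {
			mon->unsubscribe(mon, addr, len, info);
			return false;
		}
		return true;
	}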
diff --git a/deps/libfabric/prov/util/src/rocr_mem_monitor.c b/deps/libfabric/prov/util/src/rocr_mem_monitor.c
new file mode 100644
index 0000000000000000000000000000000000000000..09b8076b4ab76908fceb5be631972cfcde292881
--- /dev/null
+++ b/deps/libfabric/prov/util/src/rocr_mem_monitor.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ofi_mr.h"
+
+#ifdef HAVE_ROCR
+
+#include "ofi_tree.h"
+#include "ofi_iov.h"
+
+#include <hsa/hsa_ext_amd.h>
+
+struct rocr_mm_entry {
+	struct iovec iov;
+	struct ofi_rbnode *node;
+};
+
+struct rocr_mm {
+	struct ofi_mem_monitor mm;
+	struct ofi_rbmap *dev_region_tree;
+};
+
+static int rocr_mm_start(struct ofi_mem_monitor *monitor);
+static void rocr_mm_stop(struct ofi_mem_monitor *monitor);
+static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			     size_t len, union ofi_mr_hmem_info *hmem_info);
+static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+				const void *addr, size_t len,
+				union ofi_mr_hmem_info *hmem_info);
+static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const void *addr,
+			  size_t len, union ofi_mr_hmem_info *hmem_info);
+
+static struct rocr_mm rocr_mm = {
+	.mm = {
+		.iface = FI_HMEM_ROCR,
+		.init = ofi_monitor_init,
+		.cleanup = ofi_monitor_cleanup,
+		.start = rocr_mm_start,
+		.stop = rocr_mm_stop,
+		.subscribe = rocr_mm_subscribe,
+		.unsubscribe = rocr_mm_unsubscribe,
+		.valid = rocr_mm_valid,
+	},
+};
+
+struct ofi_mem_monitor *rocr_monitor = &rocr_mm.mm;
+
+static int rocr_rbmap_compare(struct ofi_rbmap *map, void *key, void *data)
+{
+	struct rocr_mm_entry *entry = data;
+	struct iovec *iov = key;
+
+	if (ofi_iov_left(&entry->iov, iov))
+		return -1;
+	else if (ofi_iov_right(&entry->iov, iov))
+		return 1;
+
+	/* If this assert fires, the ROCR memory monitor has failed to keep
+	 * a single monitor entry per user-allocated ROCR memory region.
+	 */
+	assert(ofi_iov_within(iov, &entry->iov));
+
+	return 0;
+}
+
+static struct rocr_mm_entry *rocr_mm_entry_get_root(void)
+{
+	struct ofi_rbnode *node;
+
+	node = ofi_rbmap_get_root(rocr_mm.dev_region_tree);
+	if (node)
+		return node->data;
+	return NULL;
+}
+
+/* Entry lookup works by finding the node in the device region tree whose
+ * monitored range contains the given address.  Searching therefore needs
+ * only an address rather than an address and length.
+ */
+static struct rocr_mm_entry *rocr_mm_entry_find(const void *addr)
+{
+	struct ofi_rbnode *node;
+	struct iovec iov = {
+		.iov_base = (void *) addr,
+		.iov_len = 1,
+	};
+
+	node = ofi_rbmap_find(rocr_mm.dev_region_tree, (void *) &iov);
+	if (node)
+		return node->data;
+	return NULL;
+}
+
+/* A pointer to a ROCR memory monitor entry is never registered as callback
+ * user data, since the entry may be freed before the callback runs
+ * (use-after-free).  Instead, the region length is registered and the
+ * callback receives the address directly.  Unsubscribe then looks up the
+ * corresponding ROCR memory monitor entry and frees it if found.
+ */
+static void rocr_mm_dealloc_cb(void *addr, void *user_data)
+{
+	size_t len = (size_t) user_data;
+
+	pthread_rwlock_rdlock(&mm_list_rwlock);
+	pthread_mutex_lock(&mm_lock);
+	ofi_monitor_unsubscribe(rocr_monitor, addr, len, NULL);
+	pthread_mutex_unlock(&mm_lock);
+	pthread_rwlock_unlock(&mm_list_rwlock);
+}
+
+static void rocr_mm_entry_free(struct rocr_mm_entry *entry)
+{
+	hsa_status_t hsa_ret __attribute__((unused));
+
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "ROCR buffer address %p length %lu monitor entry freed\n",
+	       entry->iov.iov_base, entry->iov.iov_len);
+
+	/* Two return codes are expected. HSA_STATUS_SUCCESS is returned if the
+	 * deallocation callback was not triggered and the entry is freed.
+	 * HSA_STATUS_ERROR_INVALID_ARGUMENT is returned if the deallocation
+	 * callback was triggered and the entry is freed. Any other return code
+	 * puts the monitor in an unknown state and is fatal.
+	 */
+	hsa_ret = ofi_hsa_amd_dereg_dealloc_cb(entry->iov.iov_base,
+					       rocr_mm_dealloc_cb);
+	assert(hsa_ret == HSA_STATUS_SUCCESS ||
+	       hsa_ret == HSA_STATUS_ERROR_INVALID_ARGUMENT);
+
+	ofi_rbmap_delete(rocr_mm.dev_region_tree, entry->node);
+	free(entry);
+}
+
+/* Each ROCR memory monitor entry is sized to the entire user-allocated ROCR
+ * memory region. A single deallocation callback is registered for the memory
+ * region. This callback is called when the user frees the ROCR memory region.
+ */
+static int rocr_mm_entry_alloc(const void *addr, struct rocr_mm_entry **entry)
+{
+	hsa_amd_pointer_info_t hsa_info = {
+		.size = sizeof(hsa_info),
+	};
+	hsa_status_t hsa_ret;
+	int ret;
+
+	*entry = malloc(sizeof(**entry));
+	if (!*entry) {
+		ret = -FI_ENOMEM;
+		goto err;
+	}
+
+	/* Determine full ROCR memory region size. */
+	hsa_ret = ofi_hsa_amd_pointer_info((void *) addr, &hsa_info, NULL, NULL,
+					   NULL);
+	if (hsa_ret != HSA_STATUS_SUCCESS) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform hsa_amd_pointer_info: %s\n",
+			ofi_hsa_status_to_string(hsa_ret));
+		ret = -FI_EIO;
+		goto err_free_entry;
+	}
+
+	if (hsa_info.type != HSA_EXT_POINTER_TYPE_HSA) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Cannot monitor non-HSA allocated memory\n");
+		ret = -FI_EINVAL;
+		goto err_free_entry;
+	}
+
+	(*entry)->iov.iov_base = hsa_info.agentBaseAddress;
+	(*entry)->iov.iov_len = hsa_info.sizeInBytes;
+
+	ret = ofi_rbmap_insert(rocr_mm.dev_region_tree,
+			       (void *) &(*entry)->iov,
+			       (void *) *entry, &(*entry)->node);
+	if (ret) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"Failed to insert into RB tree: %s\n", strerror(ret));
+		goto err_free_entry;
+	}
+
+	/* Register a deallocation callback for this ROCR memory region. */
+	hsa_ret = ofi_hsa_amd_reg_dealloc_cb(hsa_info.agentBaseAddress,
+					     rocr_mm_dealloc_cb,
+					     (void *) hsa_info.sizeInBytes);
+	if (hsa_ret != HSA_STATUS_SUCCESS) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform hsa_amd_register_deallocation_callback: %s\n",
+			ofi_hsa_status_to_string(hsa_ret));
+
+		ret = -FI_EIO;
+		goto err_rbmap_delete_entry;
+	}
+
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "ROCR buffer address %p length %lu monitor entry allocated\n",
+	       hsa_info.agentBaseAddress, hsa_info.sizeInBytes);
+
+	return FI_SUCCESS;
+
+err_rbmap_delete_entry:
+	ofi_rbmap_delete(rocr_mm.dev_region_tree, (*entry)->node);
+err_free_entry:
+	free(*entry);
+err:
+	*entry = NULL;
+	return ret;
+}
+
+static int rocr_mm_start(struct ofi_mem_monitor *monitor)
+{
+	rocr_mm.dev_region_tree = ofi_rbmap_create(rocr_rbmap_compare);
+	if (!rocr_mm.dev_region_tree)
+		return -FI_ENOMEM;
+	return FI_SUCCESS;
+}
+
+static void rocr_mm_stop(struct ofi_mem_monitor *monitor)
+{
+	struct rocr_mm_entry *entry;
+
+	while ((entry = rocr_mm_entry_get_root()))
+		rocr_mm_entry_free(entry);
+
+	assert(ofi_rbmap_empty(rocr_mm.dev_region_tree));
+
+	ofi_rbmap_destroy(rocr_mm.dev_region_tree);
+}
+
+static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+				const void *addr, size_t len,
+				union ofi_mr_hmem_info *hmem_info)
+{
+	struct rocr_mm_entry *entry;
+	size_t cur_len = len;
+	void *cur_addr = (void *) addr;
+	void *next_addr;
+
+	/* The user unsubscribe region may span multiple ROCR memory regions.
+	 * Each region's monitor entry must be freed and the MR caches
+	 * notified.
+	 */
+	while (cur_len) {
+		entry = rocr_mm_entry_find(cur_addr);
+		if (!entry)
+			break;
+
+		ofi_monitor_notify(rocr_monitor, entry->iov.iov_base,
+				   entry->iov.iov_len);
+
+		FI_DBG(&core_prov, FI_LOG_MR,
+		       "ROCR buffer address %p length %lu unsubscribed\n",
+		       entry->iov.iov_base, entry->iov.iov_len);
+
+		next_addr = (void *) ((uintptr_t) ofi_iov_end(&entry->iov) + 1);
+
+		rocr_mm_entry_free(entry);
+
+		cur_len -= MIN((uintptr_t) next_addr - (uintptr_t) cur_addr,
+			       cur_len);
+		cur_addr = next_addr;
+	}
+
+	if (cur_len)
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"Failed to completely unsubscribe from address %p length %lu\n",
+			addr, len);
+}
+
+/* Subscribe is designed to monitor entire ROCR memory regions even if the user
+ * subscribe region is smaller. All ROCR memory regions are inserted into an RB
+ * tree for tracking. Future subscriptions will always reuse RB tree entries if
+ * possible.
+ *
+ * RB tree entries can be removed in two different ways:
+ * 1. An unsubscribe against the memory region occurs.  This happens when
+ *    ROCR invokes the deallocation callback.
+ * 2. The ROCR memory monitor is stopped.
+ *
+ * Note: The ROCR memory monitor does not impose a limit on the number of ROCR
+ * memory regions which can be monitored.
+ */
+static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			     size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	struct rocr_mm_entry *entry;
+	int ret = FI_SUCCESS;
+	size_t cur_len = len;
+	void *cur_addr = (void *) addr;
+	void *next_addr;
+
+	/* The user subscribe region may span multiple ROCR memory regions.
+	 * In that case each region must be monitored, which requires
+	 * allocating one ROCR memory monitor entry per region.
+	 */
+	while (cur_len) {
+		entry = rocr_mm_entry_find(cur_addr);
+		if (entry) {
+			FI_DBG(&core_prov, FI_LOG_MR,
+			       "Reusing monitored ROCR buffer address %p length %lu\n",
+			       entry->iov.iov_base, entry->iov.iov_len);
+		} else {
+			/* On error, previously allocated entries are not
+			 * cleaned up. This is harmless since these entries
+			 * will be cleaned up either when the user frees the
+			 * ROCR memory region or when the memory monitor is
+			 * stopped.
+			 */
+			ret = rocr_mm_entry_alloc(cur_addr, &entry);
+			if (ret != FI_SUCCESS)
+				break;
+		}
+
+		next_addr = (void *) ((uintptr_t) ofi_iov_end(&entry->iov) + 1);
+		cur_len -= MIN((uintptr_t) next_addr - (uintptr_t) cur_addr,
+			       cur_len);
+		cur_addr = next_addr;
+	}
+
+	FI_LOG(&core_prov, ret ? FI_LOG_WARN : FI_LOG_DEBUG, FI_LOG_MR,
+	       "ROCR buffer address %p length %lu subscribe status: %s\n", addr,
+	       len, fi_strerror(-ret));
+
+	return ret;
+}
+
+static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const void *addr,
+			  size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	/* no-op */
+	return true;
+}
+
+#else
+
+static int rocr_mm_start(struct ofi_mem_monitor *monitor)
+{
+	return -FI_ENOSYS;
+}
+
+static void rocr_mm_stop(struct ofi_mem_monitor *monitor)
+{
+}
+
+static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr,
+			     size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	return -FI_ENOSYS;
+}
+
+static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor,
+				const void *addr, size_t len,
+				union ofi_mr_hmem_info *hmem_info)
+{
+}
+
+static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const void *addr,
+			  size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	return false;
+}
+
+static struct ofi_mem_monitor rocr_mm = {
+	.iface = FI_HMEM_ROCR,
+	.init = ofi_monitor_init,
+	.cleanup = ofi_monitor_cleanup,
+	.start = rocr_mm_start,
+	.stop = rocr_mm_stop,
+	.subscribe = rocr_mm_subscribe,
+	.unsubscribe = rocr_mm_unsubscribe,
+	.valid = rocr_mm_valid,
+};
+
+struct ofi_mem_monitor *rocr_monitor = &rocr_mm;
+
+#endif /* HAVE_ROCR */
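
Both subscribe and unsubscribe walk a user range that may span several ROCR regions using the same cursor arithmetic. Isolated as a minimal sketch, assuming ofi_iov_end returns the address of an iovec's last byte (as its use above implies):

	#include <stdint.h>
	#include <sys/uio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	/* Advance the (cur_addr, cur_len) cursor past the monitored region
	 * containing *cur_addr; mirrors the loop bodies above. */
	static void advance_past_region(const struct iovec *region,
					void **cur_addr, size_t *cur_len)
	{
		/* First byte after the region, i.e. ofi_iov_end() + 1. */
		uintptr_t next = (uintptr_t) region->iov_base + region->iov_len;

		*cur_len -= MIN(next - (uintptr_t) *cur_addr, *cur_len);
		*cur_addr = (void *) next;
	}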
diff --git a/deps/libfabric/prov/util/src/util_atomic.c b/deps/libfabric/prov/util/src/util_atomic.c
index 5a032b1c8c4b3c6a8d8f40490279292a9afb4a06..a95057011ca68fba0ecef105f5e9eff1ecc4ac87 100644
--- a/deps/libfabric/prov/util/src/util_atomic.c
+++ b/deps/libfabric/prov/util/src/util_atomic.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
  * Copyright (c) 2018 Cray Inc. All rights reserved.
+ * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -820,7 +821,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITEEXT, FUNC, OFI_OP_LXOR)
 OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(WRITEEXT_CMP, NAME, OFI_OP_MIN) },
@@ -854,7 +855,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ)
 OFI_DEFINE_ALL_HANDLERS(EXCHANGE, FUNC, OFI_OP_READWRITE)
 
-void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(READWRITEEXT_CMP, NAME, OFI_OP_MIN) },
@@ -883,7 +884,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAPEXT_CMP, FUNC, OFI_OP_CSWAP_GE)
 OFI_DEFINE_REALNO_HANDLERS(CSWAPEXT_CMP, FUNC, OFI_OP_CSWAP_GT)
 OFI_DEFINE_INT_HANDLERS(CSWAPEXT, FUNC, OFI_OP_MSWAP)
 
-void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, const void *cmp, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) },
@@ -918,7 +919,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_LXOR)
 OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(WRITE, NAME, OFI_OP_MIN) },
@@ -952,7 +953,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR)
 OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ)
 OFI_DEFINE_ALL_HANDLERS(READWRITE, FUNC, OFI_OP_WRITE)
 
-void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_REALNO_HANDLERS(READWRITE, NAME, OFI_OP_MIN) },
@@ -981,7 +982,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GE)
 OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GT)
 OFI_DEFINE_INT_HANDLERS(CSWAP, FUNC, OFI_OP_MSWAP)
 
-void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST])
+void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST])
 	(void *dst, const void *src, const void *cmp, void *res, size_t cnt) =
 {
 	{ OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) },
@@ -1028,19 +1029,19 @@ int ofi_atomic_valid(const struct fi_provider *prov,
 	}
 
 	if (flags & FI_FETCH_ATOMIC) {
-		if (op >= OFI_READWRITE_OP_LAST) {
+		if (!ofi_atomic_isreadwrite_op(op)) {
 			FI_INFO(prov, FI_LOG_DOMAIN, "Invalid fetch operation\n");
 			return -FI_EOPNOTSUPP;
 		}
 		have_func = ofi_atomic_readwrite_handlers[op][datatype] != NULL;
 	} else if (flags & FI_COMPARE_ATOMIC) {
-		if (op < FI_CSWAP || op > FI_MSWAP) {
+		if (!ofi_atomic_isswap_op(op)) {
 			FI_INFO(prov, FI_LOG_DOMAIN, "Invalid swap operation\n");
 			return -FI_EOPNOTSUPP;
 		}
 		have_func = ofi_atomic_swap_handlers[op - FI_CSWAP][datatype] != NULL;
 	} else {
-		if (op >= OFI_WRITE_OP_LAST) {
+		if (!ofi_atomic_iswrite_op(op)) {
 			FI_INFO(prov, FI_LOG_DOMAIN, "Invalid write operation\n");
 			return -FI_EOPNOTSUPP;
 		}
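
The open-coded range tests become predicate helpers, and the *_OP_LAST enumerators become *_OP_CNT sizes. Given the ranges the removed checks enforced, the helpers plausibly look like the sketch below; the real definitions live in ofi_atomic.h and may differ (for instance by excluding FI_ATOMIC_READ from write ops):

	static inline bool ofi_atomic_iswrite_op(int op)
	{
		return op < OFI_WRITE_OP_CNT;
	}

	static inline bool ofi_atomic_isreadwrite_op(int op)
	{
		return op < OFI_READWRITE_OP_CNT;
	}

	/* Swap handlers are indexed by op - FI_CSWAP, hence this range. */
	static inline bool ofi_atomic_isswap_op(int op)
	{
		return op >= FI_CSWAP && op <= FI_MSWAP;
	}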
diff --git a/deps/libfabric/prov/util/src/util_attr.c b/deps/libfabric/prov/util/src/util_attr.c
index 24815a1d64fb84b20b8115d10f67d78ec3a0663f..84fb143ed1b20e88f89789a0ad3900c85c171b8c 100644
--- a/deps/libfabric/prov/util/src/util_attr.c
+++ b/deps/libfabric/prov/util/src/util_attr.c
@@ -151,8 +151,30 @@ static int ofi_dup_addr(const struct fi_info *info, struct fi_info *dup)
 	return 0;
 }
 
+static int ofi_set_prov_name(const struct fi_provider *prov,
+			     const struct fi_fabric_attr *util_hints,
+			     const struct fi_info *base_attr,
+			     struct fi_fabric_attr *core_hints)
+{
+	if (util_hints->prov_name) {
+		core_hints->prov_name = strdup(util_hints->prov_name);
+		if (!core_hints->prov_name)
+			return -FI_ENOMEM;
+	} else if (base_attr && base_attr->fabric_attr &&
+		   base_attr->fabric_attr->prov_name) {
+		core_hints->prov_name = strdup(base_attr->fabric_attr->
+					       prov_name);
+		if (!core_hints->prov_name)
+			return -FI_ENOMEM;
+	}
+
+	return core_hints->prov_name ?
+	       ofi_exclude_prov_name(&core_hints->prov_name, prov->name) : 0;
+}
+
 static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov,
-			    const struct fi_info *util_info,
+			    const struct fi_info *util_hints,
+			    const struct fi_info *base_attr,
 			    ofi_alter_info_t info_to_core,
 			    struct fi_info **core_hints)
 {
@@ -161,19 +183,19 @@ static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov,
 	if (!(*core_hints = fi_allocinfo()))
 		return -FI_ENOMEM;
 
-	if (info_to_core(version, util_info, *core_hints))
+	if (info_to_core(version, util_hints, base_attr, *core_hints))
 		goto err;
 
-	if (!util_info)
+	if (!util_hints)
 		return 0;
 
-	if (ofi_dup_addr(util_info, *core_hints))
+	if (ofi_dup_addr(util_hints, *core_hints))
 		goto err;
 
-	if (util_info->fabric_attr) {
-		if (util_info->fabric_attr->name) {
+	if (util_hints->fabric_attr) {
+		if (util_hints->fabric_attr->name) {
 			(*core_hints)->fabric_attr->name =
-				strdup(util_info->fabric_attr->name);
+				strdup(util_hints->fabric_attr->name);
 			if (!(*core_hints)->fabric_attr->name) {
 				FI_WARN(prov, FI_LOG_FABRIC,
 					"Unable to allocate fabric name\n");
@@ -181,25 +203,15 @@ static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov,
 			}
 		}
 
-		if (util_info->fabric_attr->prov_name) {
-			(*core_hints)->fabric_attr->prov_name =
-				strdup(util_info->fabric_attr->prov_name);
-			if (!(*core_hints)->fabric_attr->prov_name) {
-				FI_WARN(prov, FI_LOG_FABRIC,
-					"Unable to alloc prov name\n");
-				goto err;
-			}
-			ret = ofi_exclude_prov_name(
-					&(*core_hints)->fabric_attr->prov_name,
-					prov->name);
-			if (ret)
-				goto err;
-		}
+		ret = ofi_set_prov_name(prov, util_hints->fabric_attr,
+					base_attr, (*core_hints)->fabric_attr);
+		if (ret)
+			goto err;
 	}
 
-	if (util_info->domain_attr && util_info->domain_attr->name) {
+	if (util_hints->domain_attr && util_hints->domain_attr->name) {
 		(*core_hints)->domain_attr->name =
-			strdup(util_info->domain_attr->name);
+			strdup(util_hints->domain_attr->name);
 		if (!(*core_hints)->domain_attr->name) {
 			FI_WARN(prov, FI_LOG_FABRIC,
 				"Unable to allocate domain name\n");
@@ -214,14 +226,14 @@ err:
 }
 
 static int ofi_info_to_util(uint32_t version, const struct fi_provider *prov,
-			    struct fi_info *core_info,
+			    struct fi_info *core_info, const struct fi_info *base_info,
 			    ofi_alter_info_t info_to_util,
 			    struct fi_info **util_info)
 {
 	if (!(*util_info = fi_allocinfo()))
 		return -FI_ENOMEM;
 
-	if (info_to_util(version, core_info, *util_info))
+	if (info_to_util(version, core_info, base_info, *util_info))
 		goto err;
 
 	if (ofi_dup_addr(core_info, *util_info))
@@ -268,18 +280,15 @@ err:
 
 int ofi_get_core_info(uint32_t version, const char *node, const char *service,
 		      uint64_t flags, const struct util_prov *util_prov,
-		      const struct fi_info *util_hints, ofi_alter_info_t info_to_core,
-		      struct fi_info **core_info)
+		      const struct fi_info *util_hints,
+		      const struct fi_info *base_attr,
+		      ofi_alter_info_t info_to_core, struct fi_info **core_info)
 {
 	struct fi_info *core_hints = NULL;
 	int ret;
 
-	ret = ofi_prov_check_info(util_prov, version, util_hints);
-	if (ret)
-		return ret;
-
-	ret = ofi_info_to_core(version, util_prov->prov, util_hints, info_to_core,
-			       &core_hints);
+	ret = ofi_info_to_core(version, util_prov->prov, util_hints, base_attr,
+			       info_to_core, &core_hints);
 	if (ret)
 		return ret;
 
@@ -299,31 +308,42 @@ int ofix_getinfo(uint32_t version, const char *node, const char *service,
 		 const struct fi_info *hints, ofi_alter_info_t info_to_core,
 		 ofi_alter_info_t info_to_util, struct fi_info **info)
 {
-	struct fi_info *core_info, *util_info, *cur, *tail;
-	int ret;
-
-	ret = ofi_get_core_info(version, node, service, flags, util_prov,
-				hints, info_to_core, &core_info);
-	if (ret)
-		return ret;
+	struct fi_info *core_info, *base_info, *util_info, *cur, *tail;
+	int ret = -FI_ENODATA;
 
 	*info = tail = NULL;
-	for (cur = core_info; cur; cur = cur->next) {
-		ret = ofi_info_to_util(version, util_prov->prov, cur,
-				       info_to_util, &util_info);
+	for (base_info = (struct fi_info *) util_prov->info; base_info;
+	     base_info = base_info->next) {
+		if (ofi_check_info(util_prov, base_info, version, hints))
+			continue;
+
+		ret = ofi_get_core_info(version, node, service, flags,
+					util_prov, hints, base_info,
+					info_to_core, &core_info);
 		if (ret) {
-			fi_freeinfo(*info);
+			if (ret == -FI_ENODATA)
+				continue;
 			break;
 		}
 
-		ofi_alter_info(util_info, hints, version);
-		if (!*info)
-			*info = util_info;
-		else
-			tail->next = util_info;
-		tail = util_info;
+		for (cur = core_info; cur; cur = cur->next) {
+			ret = ofi_info_to_util(version, util_prov->prov, cur,
+					       base_info, info_to_util,
+					       &util_info);
+			if (ret) {
+				fi_freeinfo(*info);
+				break;
+			}
+
+			ofi_alter_info(util_info, hints, version);
+			if (!*info)
+				*info = util_info;
+			else
+				tail->next = util_info;
+			tail = util_info;
+		}
+		fi_freeinfo(core_info);
 	}
-	fi_freeinfo(core_info);
 	return ret;
 }
 
@@ -371,7 +391,18 @@ int ofi_check_fabric_attr(const struct fi_provider *prov,
 			  const struct fi_fabric_attr *prov_attr,
 			  const struct fi_fabric_attr *user_attr)
 {
-	/* Provider names are checked by the framework */
+	/* Provider names are properly checked by the framework.  Here we
+	 * only apply a simple filter: if the util provider has supplied a
+	 * core provider name, verify that it also appears in the user's
+	 * hints, when a prov_name hint is given.
+	 */
+	if (prov_attr->prov_name && user_attr->prov_name &&
+	    !strcasestr(user_attr->prov_name, prov_attr->prov_name)) {
+		FI_INFO(prov, FI_LOG_CORE,
+			"Requesting provider %s, skipping %s\n",
+			prov_attr->prov_name, user_attr->prov_name);
+		return -FI_ENODATA;
+	}
 
 	if (user_attr->prov_version > prov_attr->prov_version) {
 		FI_INFO(prov, FI_LOG_CORE, "Unsupported provider version\n");
@@ -448,6 +479,9 @@ static int fi_resource_mgmt_level(enum fi_resource_mgmt rm_model)
  */
 static int ofi_cap_mr_mode(uint64_t info_caps, int mr_mode)
 {
+	if (!(info_caps & FI_HMEM))
+		mr_mode &= ~FI_MR_HMEM;
+
 	if (!ofi_rma_target_allowed(info_caps)) {
 		if (!(mr_mode & (FI_MR_LOCAL | FI_MR_HMEM)))
 			return 0;
@@ -729,6 +763,14 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version,
 		return -FI_ENODATA;
 	}
 
+	if ((user_info->caps & FI_TAGGED) && user_attr->mem_tag_format &&
+	    ofi_max_tag(user_attr->mem_tag_format) >
+		    ofi_max_tag(prov_attr->mem_tag_format)) {
+		FI_INFO(prov, FI_LOG_CORE, "Tag size exceeds supported size\n");
+		FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, mem_tag_format);
+		return -FI_ENODATA;
+	}
+
 	return 0;
 }
 
@@ -1080,7 +1122,10 @@ static void fi_alter_domain_attr(struct fi_domain_attr *attr,
 		attr->mr_mode = (attr->mr_mode && attr->mr_mode != FI_MR_SCALABLE) ?
 				FI_MR_BASIC : FI_MR_SCALABLE;
 	} else {
-		if ((hints_mr_mode & attr->mr_mode) != attr->mr_mode) {
+		attr->mr_mode &= ~(FI_MR_BASIC | FI_MR_SCALABLE);
+
+		if (hints &&
+		    ((hints_mr_mode & attr->mr_mode) != attr->mr_mode)) {
 			attr->mr_mode = ofi_cap_mr_mode(info_caps,
 						attr->mr_mode & hints_mr_mode);
 		}
@@ -1163,7 +1208,8 @@ static uint64_t ofi_get_info_caps(const struct fi_info *prov_info,
 	int prov_mode, user_mode;
 	uint64_t caps;
 
-	assert(user_info);
+	if (!user_info)
+		return prov_info->caps;
 
 	caps = ofi_get_caps(prov_info->caps, user_info->caps, prov_info->caps);
 
@@ -1181,7 +1227,7 @@ static uint64_t ofi_get_info_caps(const struct fi_info *prov_info,
 	if ((FI_VERSION_LT(api_version, FI_VERSION(1,5)) &&
 	    (user_mode == FI_MR_UNSPEC)) ||
 	    (user_mode == FI_MR_BASIC) ||
-	    ((user_mode & prov_mode & OFI_MR_MODE_RMA_TARGET) == 
+	    ((user_mode & prov_mode & OFI_MR_MODE_RMA_TARGET) ==
 	     (prov_mode & OFI_MR_MODE_RMA_TARGET)))
 		return caps;
 
@@ -1197,9 +1243,6 @@ trim_caps:
 void ofi_alter_info(struct fi_info *info, const struct fi_info *hints,
 		    uint32_t api_version)
 {
-	if (!hints)
-		return;
-
 	for (; info; info = info->next) {
 		/* This should stay before call to fi_alter_domain_attr as
 		 * the checks depend on unmodified provider mr_mode attr */
@@ -1211,12 +1254,17 @@ void ofi_alter_info(struct fi_info *info, const struct fi_info *hints,
 		      (hints->domain_attr->mr_mode & (FI_MR_BASIC | FI_MR_SCALABLE)))))
 			info->mode |= FI_LOCAL_MR;
 
-		info->handle = hints->handle;
+		if (hints)
+			info->handle = hints->handle;
 
-		fi_alter_domain_attr(info->domain_attr, hints->domain_attr,
+		fi_alter_domain_attr(info->domain_attr,
+				     hints ? hints->domain_attr : NULL,
 				     info->caps, api_version);
-		fi_alter_ep_attr(info->ep_attr, hints->ep_attr, info->caps);
-		fi_alter_rx_attr(info->rx_attr, hints->rx_attr, info->caps);
-		fi_alter_tx_attr(info->tx_attr, hints->tx_attr, info->caps);
+		fi_alter_ep_attr(info->ep_attr, hints ? hints->ep_attr : NULL,
+				 info->caps);
+		fi_alter_rx_attr(info->rx_attr, hints ? hints->rx_attr : NULL,
+				 info->caps);
+		fi_alter_tx_attr(info->tx_attr, hints ? hints->tx_attr : NULL,
+				 info->caps);
 	}
 }
diff --git a/deps/libfabric/prov/util/src/util_av.c b/deps/libfabric/prov/util/src/util_av.c
index 6d2e7589ecbcb091c00c01ed217699c6c69366f1..2ec86d7d2cb02ad6f7b9dc19ca60d9558fcb7760 100644
--- a/deps/libfabric/prov/util/src/util_av.c
+++ b/deps/libfabric/prov/util/src/util_av.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017 Intel Corporation. All rights reserved.
+ * Copyright (c) 2015-2020 Intel Corporation. All rights reserved.
  * Copyright (c) 2017, Cisco Systems, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -53,7 +53,6 @@
 
 enum {
 	UTIL_NO_ENTRY = -1,
-	UTIL_DEFAULT_AV_SIZE = 1024,
 };
 
 static int fi_get_src_sockaddr(const struct sockaddr *dest_addr, size_t dest_addrlen,
@@ -245,7 +244,7 @@ void *ofi_av_get_addr(struct util_av *av, fi_addr_t fi_addr)
 	struct util_av_entry *entry;
 
 	entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr);
-	return entry->addr;
+	return entry->data;
 }
 
 int ofi_verify_av_insert(struct util_av *av, uint64_t flags)
@@ -278,13 +277,17 @@ int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr)
 		return 0;
 	} else {
 		entry = ofi_ibuf_alloc(av->av_entry_pool);
-		if (!entry)
+		if (!entry) {
+			if (fi_addr)
+				*fi_addr = FI_ADDR_NOTAVAIL;
 			return -FI_ENOMEM;
+		}
+
 		if (fi_addr)
 			*fi_addr = ofi_buf_index(entry);
-		memcpy(entry->addr, addr, av->addrlen);
+		memcpy(entry->data, addr, av->addrlen);
 		ofi_atomic_initialize32(&entry->use_cnt, 1);
-		HASH_ADD(hh, av->hash, addr, av->addrlen, entry);
+		HASH_ADD(hh, av->hash, data, av->addrlen, entry);
 	}
 	return 0;
 }
@@ -295,7 +298,7 @@ int ofi_av_elements_iter(struct util_av *av, ofi_av_apply_func apply, void *arg)
 	int ret;
 
 	HASH_ITER(hh, av->hash, av_entry, av_entry_tmp) {
-		ret = apply(av, av_entry->addr,
+		ret = apply(av, av_entry->data,
 			    ofi_buf_index(av_entry), arg);
 		if (OFI_UNLIKELY(ret))
 			return ret;
@@ -424,8 +427,15 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr,
 {
 	int ret = 0;
 	size_t max_count;
+	size_t offset;
+
+	/* pad the address length to an 8-byte boundary so the per-entry
+	 * context that follows stays aligned */
+	offset = util_attr->addrlen % 8;
+	if (offset != 0)
+		offset = 8 - offset;
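+	/* e.g. addrlen = 6 yields offset = 2, so the per-entry context
+	 * area that follows the address starts 8-byte aligned.
+	 */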
 	struct ofi_bufpool_attr pool_attr = {
-		.size		= util_attr->addrlen +
+		.size		= util_attr->addrlen + offset +
+				  util_attr->context_len +
 				  sizeof(struct util_av_entry),
 		.alignment	= 16,
 		.max_cnt	= 0,
@@ -442,19 +452,12 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr,
 	if (ret)
 		return ret;
 
-	if (attr->count) {
-		max_count = attr->count;
-	} else {
-		if (fi_param_get_size_t(NULL, "universe_size", &max_count))
-			max_count = UTIL_DEFAULT_AV_SIZE;
-	}
-
-	av->count = roundup_power_of_two(max_count ?
-					 max_count :
-					 UTIL_DEFAULT_AV_SIZE);
+	max_count = attr->count ? attr->count : ofi_universe_size;
+	av->count = roundup_power_of_two(max_count);
 	FI_INFO(av->prov, FI_LOG_AV, "AV size %zu\n", av->count);
 
 	av->addrlen = util_attr->addrlen;
+	av->context_offset = offset + av->addrlen;
 	av->flags = util_attr->flags | attr->flags;
 	av->hash = NULL;
 
@@ -592,20 +595,18 @@ static int ip_av_insert_addr(struct util_av *av, const void *addr,
 			     fi_addr_t *fi_addr, void *context)
 {
 	int ret;
-	fi_addr_t fi_addr_ret;
 
 	if (ip_av_valid_addr(av, addr)) {
 		fastlock_acquire(&av->lock);
-		ret = ofi_av_insert_addr(av, addr, &fi_addr_ret);
+		ret = ofi_av_insert_addr(av, addr, fi_addr);
 		fastlock_release(&av->lock);
 	} else {
 		ret = -FI_EADDRNOTAVAIL;
+		if (fi_addr)
+			*fi_addr = FI_ADDR_NOTAVAIL;
 		FI_WARN(av->prov, FI_LOG_AV, "invalid address\n");
 	}
 
-	if (fi_addr)
-		*fi_addr = !ret ? fi_addr_ret : FI_ADDR_NOTAVAIL;
-
 	ofi_straddr_dbg(av->prov, FI_LOG_AV, "av_insert addr", addr);
 	if (fi_addr)
 		FI_DBG(av->prov, FI_LOG_AV, "av_insert fi_addr: %" PRIu64 "\n",
@@ -896,7 +897,7 @@ int ofi_ip_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
 		container_of(av_fid, struct util_av, av_fid);
 	size_t av_addrlen;
 	void *av_addr = ofi_av_lookup_addr(av, fi_addr, &av_addrlen);
-	
+
 	memcpy(addr, av_addr, MIN(*addrlen, av_addrlen));
 	*addrlen = av->addrlen;
 
@@ -955,6 +956,7 @@ int ofi_ip_av_create_flags(struct fid_domain *domain_fid, struct fi_av_attr *att
 		util_attr.addrlen = sizeof(struct sockaddr_in6);
 
 	util_attr.flags = flags;
+	util_attr.context_len = 0;
 
 	if (attr->type == FI_AV_UNSPEC)
 		attr->type = FI_AV_MAP;
diff --git a/deps/libfabric/prov/util/src/util_buf.c b/deps/libfabric/prov/util/src/util_buf.c
index a62ee547beaf81def8b160442b0369492b0197af..74f10a987b93d89d1fce2dea4fa2f9ae88af49c9 100644
--- a/deps/libfabric/prov/util/src/util_buf.c
+++ b/deps/libfabric/prov/util/src/util_buf.c
@@ -67,15 +67,13 @@ int ofi_bufpool_grow(struct ofi_bufpool *pool)
 		ret = ofi_alloc_hugepage_buf((void **) &buf_region->alloc_region,
 					     pool->alloc_size);
 		/* If we can't allocate huge pages, fall back to normal
-		 * allocations if this is the first allocation attempt.
+		 * allocations for all future attempts.
 		 */
-		if (ret && !pool->entry_cnt) {
+		if (ret) {
 			pool->attr.flags &= ~OFI_BUFPOOL_HUGEPAGES;
-			pool->alloc_size = (pool->attr.chunk_cnt + 1) *
-					   pool->entry_size;
-			pool->region_size = pool->alloc_size - pool->entry_size;
 			goto retry;
 		}
+		buf_region->flags = OFI_BUFPOOL_HUGEPAGES;
 	} else {
 retry:
 		ret = ofi_memalign((void **) &buf_region->alloc_region,
@@ -156,7 +154,7 @@ err3:
 	if (pool->attr.free_fn)
 	    pool->attr.free_fn(buf_region);
 err2:
-	if (pool->attr.flags & OFI_BUFPOOL_HUGEPAGES)
+	if (buf_region->flags & OFI_BUFPOOL_HUGEPAGES)
 		ofi_free_hugepage_buf(buf_region->alloc_region, pool->alloc_size);
 	else
 		ofi_freealign(buf_region->alloc_region);
@@ -221,7 +219,7 @@ void ofi_bufpool_destroy(struct ofi_bufpool *pool)
 		if (pool->attr.free_fn)
 			pool->attr.free_fn(buf_region);
 
-		if (pool->attr.flags & OFI_BUFPOOL_HUGEPAGES) {
+		if (buf_region->flags & OFI_BUFPOOL_HUGEPAGES) {
 			ret = ofi_free_hugepage_buf(buf_region->alloc_region,
 						    pool->alloc_size);
 			if (ret) {
diff --git a/deps/libfabric/prov/util/src/util_coll.c b/deps/libfabric/prov/util/src/util_coll.c
index 64b9a4e7460348a1a540c2de0bcdedbfaafde2da..2322f319b8cfdf8d81bacf1072d8272f15e1e1cf 100644
--- a/deps/libfabric/prov/util/src/util_coll.c
+++ b/deps/libfabric/prov/util/src/util_coll.c
@@ -641,7 +641,7 @@ static int util_coll_scatter(struct util_coll_operation *coll_op, const void *da
 			// according to destination rank. if we're rank 3, data intended for
 			// ranks 0-2 will be moved to the end
 			*temp = malloc(cur_cnt * ofi_datatype_size(datatype));
-			if (!temp)
+			if (!*temp)
 				return -FI_ENOMEM;
 			ret = util_coll_sched_copy(coll_op,
 						   (char *) data + nbytes * local_rank, *temp,
@@ -871,11 +871,10 @@ void util_coll_collective_comp(struct util_coll_operation *coll_op)
 static int util_coll_proc_reduce_item(struct util_coll_reduce_item *reduce_item)
 {
 	if (FI_MIN <= reduce_item->op && FI_BXOR >= reduce_item->op) {
-		ofi_atomic_write_handlers[reduce_item->op]
-					 [reduce_item->datatype](
-						 reduce_item->inout_buf,
-						 reduce_item->in_buf,
-						 reduce_item->count);
+		ofi_atomic_write_handler(reduce_item->op, reduce_item->datatype,
+					 reduce_item->inout_buf,
+					 reduce_item->in_buf,
+					 reduce_item->count);
 	} else {
 		return -FI_ENOSYS;
 	}
diff --git a/deps/libfabric/prov/util/src/util_cq.c b/deps/libfabric/prov/util/src/util_cq.c
index bf0269ad038e1a9e808a5fc7df9382a55e7fb6bd..fe230f525028f1e0267e11ba9582507f0600473e 100644
--- a/deps/libfabric/prov/util/src/util_cq.c
+++ b/deps/libfabric/prov/util/src/util_cq.c
@@ -687,7 +687,7 @@ uint64_t ofi_rx_flags[] = {
 };
 
 uint64_t ofi_tx_flags[] = {
-	[ofi_op_msg] = FI_SEND,
+	[ofi_op_msg] = FI_SEND | FI_MSG,
 	[ofi_op_tagged] = FI_SEND | FI_TAGGED,
 	[ofi_op_read_req] = FI_RMA | FI_READ,
 	[ofi_op_read_rsp] = FI_RMA | FI_READ,
diff --git a/deps/libfabric/prov/util/src/util_eq.c b/deps/libfabric/prov/util/src/util_eq.c
index fde0aabbbf650e89e3bcebb4729e8500ac4aca27..915084642b67e99bdf027b649ceec0ce8f41afb5 100644
--- a/deps/libfabric/prov/util/src/util_eq.c
+++ b/deps/libfabric/prov/util/src/util_eq.c
@@ -41,7 +41,8 @@ void ofi_eq_handle_err_entry(uint32_t api_version, uint64_t flags,
 			     struct fi_eq_err_entry *user_err_entry)
 {
 	if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5)))
-	    && user_err_entry->err_data && user_err_entry->err_data_size) {
+	    && user_err_entry->err_data && user_err_entry->err_data_size
+	    && err_entry->err_data && err_entry->err_data_size) {
 		void *err_data = user_err_entry->err_data;
 		size_t err_data_size = MIN(err_entry->err_data_size,
 					   user_err_entry->err_data_size);
@@ -104,7 +105,7 @@ ssize_t ofi_eq_read(struct fid_eq *eq_fid, uint32_t *event,
 
 			ofi_eq_handle_err_entry(eq->fabric->fabric_fid.api_version,
 						flags, err_entry, buf);
-			ret = (ssize_t) entry->size;
+			ret = entry->size;
 
 			if (!(flags & FI_PEEK))
 				eq->saved_err_data = err_entry->err_data;
@@ -143,7 +144,7 @@ ssize_t ofi_eq_write(struct fid_eq *eq_fid, uint32_t event,
 	if (!entry)
 		return -FI_ENOMEM;
 
-	entry->size = (int) len;
+	entry->size = len;
 	entry->event = event;
 	entry->err = !!(flags & UTIL_FLAG_ERROR);
 	memcpy(entry->data, buf, len);
diff --git a/deps/libfabric/prov/util/src/util_main.c b/deps/libfabric/prov/util/src/util_main.c
index 345c74d73ce4c87442a59b1d52fe65e530ba7ded..6c4e77051fc5113691077b87a0e0780fb9f327de 100644
--- a/deps/libfabric/prov/util/src/util_main.c
+++ b/deps/libfabric/prov/util/src/util_main.c
@@ -283,7 +283,9 @@ static void util_set_netif_names(struct fi_info *info,
  * given fi_info input.
  */
 #if HAVE_GETIFADDRS
-static void util_getinfo_ifs(const struct util_prov *prov, struct fi_info *src_info,
+static void util_getinfo_ifs(const struct util_prov *prov,
+			     const struct fi_info *hints,
+			     struct fi_info *src_info,
 			     struct fi_info **head, struct fi_info **tail)
 {
 	struct fi_info *cur;
@@ -302,6 +304,10 @@ static void util_getinfo_ifs(const struct util_prov *prov, struct fi_info *src_i
 	slist_foreach(&addr_list, entry, prev) {
 		addr_entry = container_of(entry, struct ofi_addr_list_entry, entry);
 
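+		/* Skip interfaces whose local/remote communication
+		 * capabilities cannot satisfy the FI_LOCAL_COMM or
+		 * FI_REMOTE_COMM bits set in the hints.
+		 */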
+		if (hints && ((hints->caps & addr_entry->comm_caps) !=
+		    (hints->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM))))
+			continue;
+
 		cur = fi_dupinfo(src_info);
 		if (!cur)
 			break;
@@ -328,6 +334,8 @@ static void util_getinfo_ifs(const struct util_prov *prov, struct fi_info *src_i
 			continue;
 		}
 
+		cur->caps = (cur->caps & ~(FI_LOCAL_COMM | FI_REMOTE_COMM)) |
+			    addr_entry->comm_caps;
 		cur->src_addr = mem_dup(&addr_entry->ipaddr, addrlen);
 		if (cur->src_addr) {
 			cur->src_addrlen = addrlen;
@@ -343,7 +351,9 @@ static void util_getinfo_ifs(const struct util_prov *prov, struct fi_info *src_i
 	}
 }
 #else
-static void util_getinfo_ifs(const struct util_prov *prov, struct fi_info *src_info,
+static void util_getinfo_ifs(const struct util_prov *prov,
+			     const struct fi_info *hints,
+			     struct fi_info *src_info,
 			     struct fi_info **head, struct fi_info **tail)
 {
 	*head = src_info;
@@ -377,7 +387,7 @@ int ofi_ip_getinfo(const struct util_prov *prov, uint32_t version,
 	prev = info;
 	for (cur = *info; cur; cur = cur->next) {
 		if (!cur->src_addr && !cur->dest_addr) {
-			util_getinfo_ifs(prov, cur, &head, &tail);
+			util_getinfo_ifs(prov, hints, cur, &head, &tail);
 			if (head != cur) {
 				tail->next = (*prev)->next;
 				*prev = head;
diff --git a/deps/libfabric/prov/util/src/util_mem_hooks.c b/deps/libfabric/prov/util/src/util_mem_hooks.c
index a8ab68169256adae97c52858acb68079f5488c31..7a61948ad33cae09325723421f1fc7aeddc4816c 100644
--- a/deps/libfabric/prov/util/src/util_mem_hooks.c
+++ b/deps/libfabric/prov/util/src/util_mem_hooks.c
@@ -1,6 +1,22 @@
 /*
- * Copyright (c) 2016 Los Alamos National Security, LLC. All rights reserved.
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009-2017 Cisco Systems, Inc.  All rights reserved
+ * Copyright (c) 2013-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2016-2017 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2016-2020 IBM Corporation.  All rights reserved.
  * Copyright (c) 2019 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  *
  * License text from Open-MPI (www.open-mpi.org/community/license.php)
  *
@@ -41,40 +57,68 @@
  */
 
 #include <ofi_mr.h>
+#include <ofi_mem.h>
 
-struct ofi_memhooks memhooks;
+static int ofi_memhooks_start(struct ofi_mem_monitor *monitor);
+static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor);
+
+struct ofi_memhooks memhooks = {
+	.monitor.iface = FI_HMEM_SYSTEM,
+	.monitor.init = ofi_monitor_init,
+	.monitor.cleanup = ofi_monitor_cleanup,
+	.monitor.start = ofi_memhooks_start,
+	.monitor.stop = ofi_memhooks_stop,
+};
 struct ofi_mem_monitor *memhooks_monitor = &memhooks.monitor;
 
 
-#if defined(__linux__) && defined(HAVE_ELF_H) && defined(HAVE_SYS_AUXV_H)
+/* memhook support checks */
+#if HAVE_MEMHOOKS_MONITOR
 
-#include <elf.h>
-#include <sys/auxv.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 #include <sys/shm.h>
+#include <sys/ipc.h>
 #include <unistd.h>
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <link.h>
 
+#if HAVE_DECL___SYSCALL && defined(HAVE___SYSCALL)
+/* Calling __syscall is preferred on some systems when arguments may be
+ * 64-bit; it also has the benefit of an off_t return type. */
+#define ofi_memhooks_syscall __syscall
+#else
+#define ofi_memhooks_syscall syscall
+#endif
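+
+/* The intercept functions below enter the kernel through this macro
+ * rather than through the (patched) glibc wrappers, e.g.
+ * ofi_memhooks_syscall(SYS_munmap, start, length).
+ */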
+
+/* These op codes used to be in bits/ipc.h but were removed in glibc in 2015
+ * with a comment saying they should be defined in internal headers:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=18560
+ * and when glibc uses that syscall it seems to do so from its own definitions:
+ * https://github.com/bminor/glibc/search?q=IPCOP_shmat&unscoped_q=IPCOP_shmat
+ */
+#if (!defined(SYS_shmat) && !defined(IPCOP_shmat))
+#define IPCOP_shmat                21
+#endif
+#if (!defined(SYS_shmdt) && !defined(IPCOP_shmdt))
+#define IPCOP_shmdt                22
+#endif
+
+#define OFI_INTERCEPT_MAX_PATCH 32
 
 struct ofi_intercept {
 	struct dlist_entry 		entry;
 	const char			*symbol;
 	void				*our_func;
+	void				*orig_func;
+	unsigned char			patch_data[OFI_INTERCEPT_MAX_PATCH];
+	unsigned char			patch_orig_data[OFI_INTERCEPT_MAX_PATCH];
+	unsigned			patch_data_size;
 	struct dlist_entry		dl_intercept_list;
 };
 
-struct ofi_dl_intercept {
-	struct dlist_entry 		entry;
-	void 				**dl_func_addr;
-	void				*dl_func;
-};
-
 enum {
-	OFI_INTERCEPT_DLOPEN,
 	OFI_INTERCEPT_MMAP,
 	OFI_INTERCEPT_MUNMAP,
 	OFI_INTERCEPT_MREMAP,
@@ -85,7 +129,6 @@ enum {
 	OFI_INTERCEPT_MAX
 };
 
-static void *ofi_intercept_dlopen(const char *filename, int flag);
 static void *ofi_intercept_mmap(void *start, size_t length,
 				int prot, int flags, int fd, off_t offset);
 static int ofi_intercept_munmap(void *start, size_t length);
@@ -97,8 +140,6 @@ static int ofi_intercept_shmdt(const void *shmaddr);
 static int ofi_intercept_brk(const void *brkaddr);
 
 static struct ofi_intercept intercepts[] = {
-	[OFI_INTERCEPT_DLOPEN] = { .symbol = "dlopen",
-				.our_func = ofi_intercept_dlopen},
 	[OFI_INTERCEPT_MMAP] = { .symbol = "mmap",
 				.our_func = ofi_intercept_mmap},
 	[OFI_INTERCEPT_MUNMAP] = { .symbol = "munmap",
@@ -115,308 +156,295 @@ static struct ofi_intercept intercepts[] = {
 				.our_func = ofi_intercept_brk},
 };
 
-struct ofi_mem_calls {
-	void *(*dlopen) (const char *, int);
-	void *(*mmap)(void *, size_t, int, int, int, off_t);
-	int (*munmap)(void *, size_t);
-	void *(*mremap)(void *old_address, size_t old_size,
-			size_t new_size, int flags, ... /* void *new_address */ );
-	int (*madvise)(void *addr, size_t length, int advice);
-	void *(*shmat)(int shmid, const void *shmaddr, int shmflg);
-	int (*shmdt)(const void *shmaddr);
-	int (*brk)(const void *brkaddr);
-};
-
-static struct ofi_mem_calls real_calls;
-
+#ifdef HAVE___CURBRK
+extern void *__curbrk; /* in libc */
+#endif
 
-static const ElfW(Phdr) *
-ofi_get_phdr_dynamic(const ElfW(Phdr) *phdr, uint16_t phnum, int phent)
+#if HAVE___CLEAR_CACHE
+/*
+ * Used on ARM64 platforms, see https://github.com/open-mpi/ompi/issues/5631
+ */
+static inline void ofi_clear_instruction_cache(uintptr_t address, size_t data_size)
 {
-	uint16_t i;
+	/* do not allow global declaration of compiler intrinsic */
+	void __clear_cache(void* beg, void* end);
 
-	for (i = 0 ; i < phnum; i++) {
-		if (phdr->p_type == PT_DYNAMIC)
-			return phdr;
-		phdr = (ElfW(Phdr)*) ((intptr_t) phdr + phent);
+	__clear_cache ((void *) address, (void *) (address + data_size));
+}
+#else
+static inline void ofi_clear_instruction_cache(uintptr_t address, size_t data_size)
+{
+	size_t i;
+	size_t offset_jump = 16;
+#if defined(__aarch64__)
+	offset_jump = 32;
+#endif
+	/* align the address */
+	address &= ~(offset_jump - 1);
+
+	for (i = 0 ; i < data_size ; i += offset_jump) {
+#if (defined(__x86_64__) || defined(__amd64__))
+		__asm__ volatile("mfence;clflush %0;mfence"::
+				 "m" (*((char*) address + i)));
+#elif defined(__aarch64__)
+		__asm__ volatile ("dc cvau, %0\n\t"
+			  "dsb ish\n\t"
+			  "ic ivau, %0\n\t"
+			  "dsb ish\n\t"
+			  "isb":: "r" (address + i));
+#endif
 	}
-
-	return NULL;
 }
+#endif
 
-static void *ofi_get_dynentry(ElfW(Addr) base, const ElfW(Phdr) *pdyn,
-			      ElfW(Sxword) type)
+static inline int ofi_write_patch(unsigned char *patch_data, void *address,
+				  size_t data_size)
 {
-	ElfW(Dyn) *dyn;
+	long page_size;
+	void *base;
+	void *bound;
+	size_t length;
 
-	for (dyn = (ElfW(Dyn)*) (base + pdyn->p_vaddr); dyn->d_tag; ++dyn) {
-		if (dyn->d_tag == type)
-			return (void *) (uintptr_t) dyn->d_un.d_val;
+	page_size = ofi_get_page_size();
+	if (page_size < 0) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"failed to get page size: %s\n", fi_strerror(-page_size));
+		return page_size;
 	}
 
-	return NULL;
-}
+	base = ofi_get_page_start(address, page_size);
+	bound = ofi_get_page_end(address, page_size);
+	length = (uintptr_t) bound - (uintptr_t) base;
 
-#if SIZE_MAX > UINT_MAX
-#define OFI_ELF_R_SYM ELF64_R_SYM
-#else
-#define OFI_ELF_R_SYM ELF32_R_SYM
-#endif
+	if (mprotect(base, length, PROT_EXEC|PROT_READ|PROT_WRITE)) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"mprotect to set PROT_WRITE on %p len %lu failed: %s\n",
+			(void *) base, length, strerror(errno));
+		return -errno;
+	}
 
-static void *ofi_dl_func_addr(ElfW(Addr) base, const ElfW(Phdr) *phdr,
-			      int16_t phnum, int phent, const char *symbol)
-{
-	const ElfW(Phdr) *dphdr;
-	ElfW(Rela) *reloc;
-	void *jmprel, *strtab;
-	char *elf_sym;
-	uint32_t relsymidx;
-	ElfW(Sym) *symtab;
-	size_t pltrelsz;
-
-	dphdr = ofi_get_phdr_dynamic(phdr, phnum, phent);
-	jmprel = ofi_get_dynentry(base, dphdr, DT_JMPREL);
-	symtab = (ElfW(Sym) *) ofi_get_dynentry(base, dphdr, DT_SYMTAB);
-	strtab = ofi_get_dynentry (base, dphdr, DT_STRTAB);
-	pltrelsz = (uintptr_t) ofi_get_dynentry(base, dphdr, DT_PLTRELSZ);
-
-	for (reloc = jmprel; (intptr_t) reloc < (intptr_t) jmprel + pltrelsz;
-	     reloc++) {
-		relsymidx = OFI_ELF_R_SYM(reloc->r_info);
-		elf_sym = (char *) strtab + symtab[relsymidx].st_name;
-		if (!strcmp(symbol, elf_sym))
-			return (void *) (base + reloc->r_offset);
-        }
-
-        return NULL;
-}
+	memcpy(address, patch_data, data_size);
 
-static int ofi_intercept_dl_calls(ElfW(Addr) base, const ElfW(Phdr) *phdr,
-				  const char *phname, int16_t phnum, int phent,
-				  struct ofi_intercept *intercept)
-{
-	struct ofi_dl_intercept *dl_entry;
-	long page_size = ofi_get_page_size();
-	void **func_addr, *page;
-	int ret;
+	ofi_clear_instruction_cache((uintptr_t) address, data_size);
 
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepting symbol %s from dl\n", intercept->symbol);
-	func_addr = ofi_dl_func_addr(base, phdr, phnum, phent, intercept->symbol);
-	if (!func_addr)
-		return FI_SUCCESS;
-
-	page = (void *) ((intptr_t) func_addr & ~(page_size - 1));
-	ret = mprotect(page, page_size, PROT_READ | PROT_WRITE);
-	if (ret < 0)
-		return -FI_ENOSYS;
-
-	if (*func_addr != intercept->our_func) {
-		dl_entry = malloc(sizeof(*dl_entry));
-		if (!dl_entry)
-			return -FI_ENOMEM;
-
-		dl_entry->dl_func_addr = func_addr;
-		dl_entry->dl_func = *func_addr;
-		*func_addr = intercept->our_func;
-		dlist_insert_tail(&dl_entry->entry, &intercept->dl_intercept_list);
-	}
+	/*
+	 * Nothing we can do here if this fails, so ignore the return code.
+	 * It should not fail, since the parameters are the same as before
+	 * and the region is already aligned.
+	 */
+	if (mprotect(base, length, PROT_EXEC|PROT_READ))
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"mprotect to drop PROT_WRITE on %p len %lu failed: %s\n",
+			 base, length, strerror(errno));
 
-	return FI_SUCCESS;
+	return 0;
 }
 
-static int ofi_intercept_phdr_handler(struct dl_phdr_info *info,
-                                    size_t size, void *data)
+static int ofi_apply_patch(struct ofi_intercept *intercept)
 {
-	struct ofi_intercept *intercept = data;
-	int phent, ret;
-
-	phent = getauxval(AT_PHENT);
-	if (phent <= 0) {
-		FI_DBG(&core_prov, FI_LOG_MR, "failed to read phent size");
-		return -FI_EINVAL;
-	}
-
-	ret = ofi_intercept_dl_calls(info->dlpi_addr, info->dlpi_phdr,
-				     info->dlpi_name, info->dlpi_phnum,
-				     phent, intercept);
-	return ret;
+	memcpy(intercept->patch_orig_data, intercept->orig_func,
+	       intercept->patch_data_size);
+	return ofi_write_patch(intercept->patch_data, intercept->orig_func,
+			       intercept->patch_data_size);
 }
 
-static void *ofi_intercept_dlopen(const char *filename, int flag)
+static int ofi_remove_patch(struct ofi_intercept *intercept)
 {
-	struct ofi_intercept  *intercept;
-	void *handle;
+	return ofi_write_patch(intercept->patch_orig_data, intercept->orig_func,
+			       intercept->patch_data_size);
+}
 
-	handle = real_calls.dlopen(filename, flag);
-	if (!handle)
-		return NULL;
+static void ofi_restore_intercepts(void)
+{
+	struct ofi_intercept *intercept;
 
-	pthread_mutex_lock(&memhooks_monitor->lock);
 	dlist_foreach_container(&memhooks.intercept_list, struct ofi_intercept,
-		intercept, entry) {
-		dl_iterate_phdr(ofi_intercept_phdr_handler, intercept);
-	}
-	pthread_mutex_unlock(&memhooks_monitor->lock);
-	return handle;
+		intercept, entry)
+		ofi_remove_patch(intercept);
 }
 
-static int ofi_restore_dl_calls(ElfW(Addr) base, const ElfW(Phdr) *phdr,
-				const char *phname, int16_t phnum, int phent,
-				struct ofi_intercept *intercept)
+#if (defined(__x86_64__) || defined(__amd64__))
+static int ofi_patch_function(struct ofi_intercept *intercept)
 {
-	struct ofi_dl_intercept *dl_entry;
-	long page_size = ofi_get_page_size();
-	void **func_addr, *page;
-	int ret;
-
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "releasing symbol %s from dl\n", intercept->symbol);
-	func_addr = ofi_dl_func_addr(base, phdr, phnum, phent, intercept->symbol);
-	if (!func_addr)
-		return FI_SUCCESS;
-
-	page = (void *) ((intptr_t) func_addr & ~(page_size - 1));
-	ret = mprotect(page, page_size, PROT_READ | PROT_WRITE);
-	if (ret < 0)
-		return -FI_ENOSYS;
-
-	dlist_foreach_container_reverse(&intercept->dl_intercept_list,
-		struct ofi_dl_intercept, dl_entry, entry) {
-
-		if (dl_entry->dl_func_addr != func_addr)
-			continue;
-
-		assert(*func_addr == intercept->our_func);
-		*func_addr = dl_entry->dl_func;
-		dlist_remove(&dl_entry->entry);
-		free(dl_entry);
-		FI_DBG(&core_prov, FI_LOG_MR,
-		       "dl symbol %s restored\n", intercept->symbol);
-		break;
-	}
-
-	return FI_SUCCESS;
+	intercept->patch_data_size = 13;
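+	/* Encode: mov r11, imm64 (49 BB <imm64>) then jmp r11 (41 FF E3),
+	 * 13 bytes in total.
+	 */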
+	*(unsigned short*)(intercept->patch_data + 0) = 0xbb49;
+	*(unsigned long* )(intercept->patch_data + 2) =
+		(unsigned long) intercept->our_func;
+	*(unsigned char*) (intercept->patch_data +10) = 0x41;
+	*(unsigned char*) (intercept->patch_data +11) = 0xff;
+	*(unsigned char*) (intercept->patch_data +12) = 0xe3;
+
+	return ofi_apply_patch(intercept);
 }
-
-static int ofi_restore_phdr_handler(struct dl_phdr_info *info,
-                                    size_t size, void *data)
+#elif defined(__aarch64__)
+/**
+ * @brief Generate a mov immediate instruction
+ *
+ * @param[in] reg   register number (0-31)
+ * @param[in] shift shift amount (0-3) * 16-bits
+ * @param[in] value immediate value
+ */
+static uint32_t mov(unsigned int reg, uint16_t shift, uint16_t value)
 {
-	struct ofi_intercept *intercept = data;
-	int phent, ret;
-
-	phent = getauxval(AT_PHENT);
-	if (phent <= 0) {
-		FI_DBG(&core_prov, FI_LOG_MR, "failed to read phent size");
-		return -FI_EINVAL;
-	}
+	return (0x1a5 << 23) + ((uint32_t) shift << 21) + ((uint32_t) value << 5) + reg;
+}
 
-	ret = ofi_restore_dl_calls(info->dlpi_addr, info->dlpi_phdr,
-				   info->dlpi_name, info->dlpi_phnum,
-				   phent, intercept);
-	return ret;
+/**
+ * @brief Generate a mov immediate with keep instruction
+ *
+ * @param[in] reg   register number (0-31)
+ * @param[in] shift shift amount (0-3) * 16-bits
+ * @param[in] value immediate value
+ */
+static uint32_t movk(unsigned int reg, uint16_t shift, uint16_t value)
+{
+	return (0x1e5 << 23) + ((uint32_t) shift << 21) + ((uint32_t) value << 5) + reg;
 }
 
-static void ofi_restore_intercepts(void)
+/**
+ * @brief Generate a branch to register instruction
+ *
+ * @param[in] reg   register number (0-31)
+ */
+static uint32_t br(unsigned int reg)
 {
-	struct ofi_intercept *intercept;
+	return (0xd61f << 16) + (reg << 5);
+}
 
-	dlist_foreach_container(&memhooks.intercept_list, struct ofi_intercept,
-		intercept, entry) {
-		dl_iterate_phdr(ofi_restore_phdr_handler, intercept);
-	}
+static int ofi_patch_function(struct ofi_intercept *intercept)
+{
+	/*
+	 * x15 is the highest-numbered temporary register in the AArch64
+	 * procedure call standard; we assume it is safe to clobber here.
+	 */
+	const unsigned int gr = 15;
+	uintptr_t addr = (uintptr_t) intercept->patch_data;
+	uintptr_t value = (uintptr_t) intercept->our_func;
+
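+	/*
+	 * Emit: movz x15, #<bits 63:48>, lsl #48, followed by three movk
+	 * writes for the remaining 16-bit chunks, then br x15; five
+	 * instructions, 20 bytes in total.
+	 */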
+	*(uint32_t *) (addr +  0) = mov(gr, 3, value >> 48);
+	*(uint32_t *) (addr +  4) = movk(gr, 2, value >> 32);
+	*(uint32_t *) (addr +  8) = movk(gr, 1, value >> 16);
+	*(uint32_t *) (addr + 12) = movk(gr, 0, value);
+	intercept->patch_data_size = 16;
+
+	*(uint32_t *) ((uintptr_t) intercept->patch_data +
+		       intercept->patch_data_size) = br(gr);
+	intercept->patch_data_size = intercept->patch_data_size + 4;
+
+	return ofi_apply_patch(intercept);
 }
+#endif
 
-static int ofi_intercept_symbol(struct ofi_intercept *intercept, void **real_func)
+/*
+ * This implementation intercepts syscalls by overwriting the beginning of
+ * glibc's functions with a jump to our intercept function. After notifying
+ * the cache, we make the syscall directly. We store the original instructions
+ * and restore them when memhooks is unloaded.
+ */
+static int ofi_intercept_symbol(struct ofi_intercept *intercept)
 {
+	void *func_addr;
 	int ret;
 
 	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepting symbol %s\n", intercept->symbol);
-	ret = dl_iterate_phdr(ofi_intercept_phdr_handler, intercept);
-	if (ret)
-		return ret;
-
-	*real_func = dlsym(RTLD_DEFAULT, intercept->symbol);
-	if (*real_func == intercept->our_func) {
-		(void) dlerror();
-		*real_func = dlsym(RTLD_NEXT, intercept->symbol);
+	       "overwriting function %s\n", intercept->symbol);
+
+	func_addr = dlsym(RTLD_NEXT, intercept->symbol);
+	if (!func_addr) {
+		func_addr = dlsym(RTLD_DEFAULT, intercept->symbol);
+		if (!func_addr) {
+			FI_DBG(&core_prov, FI_LOG_MR,
+			       "could not find symbol %s\n", intercept->symbol);
+			ret = -FI_ENOMEM;
+			return ret;
+		}
 	}
 
-	if (!*real_func) {
-		FI_DBG(&core_prov, FI_LOG_MR,
-		       "could not find symbol %s\n", intercept->symbol);
-		ret = -FI_ENOMEM;
-		return ret;
-	}
-	dlist_insert_tail(&intercept->entry, &memhooks.intercept_list);
+	intercept->orig_func = func_addr;
+
+	ret = ofi_patch_function(intercept);
+
+	if (!ret)
+		dlist_insert_tail(&intercept->entry, &memhooks.intercept_list);
 
 	return ret;
 }
 
 void ofi_intercept_handler(const void *addr, size_t len)
 {
-	pthread_mutex_lock(&memhooks_monitor->lock);
+	pthread_rwlock_rdlock(&mm_list_rwlock);
+	pthread_mutex_lock(&mm_lock);
 	ofi_monitor_notify(memhooks_monitor, addr, len);
-	pthread_mutex_unlock(&memhooks_monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
+	pthread_rwlock_unlock(&mm_list_rwlock);
 }
 
 static void *ofi_intercept_mmap(void *start, size_t length,
                             int prot, int flags, int fd, off_t offset)
 {
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepted mmap start %p len %zu\n", start, length);
-	ofi_intercept_handler(start, length);
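+	/* Only a MAP_FIXED request can replace an existing mapping in
+	 * place, so that is the only case that requires invalidating
+	 * cached registrations before the call.
+	 */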
+	if ((flags & MAP_FIXED) && start)
+		ofi_intercept_handler(start, length);
 
-	return real_calls.mmap(start, length, prot, flags, fd, offset);
+	return (void *)(intptr_t) ofi_memhooks_syscall(SYS_mmap, start, length,
+						       prot, flags, fd, offset);
 }
 
 static int ofi_intercept_munmap(void *start, size_t length)
 {
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepted munmap start %p len %zu\n", start, length);
 	ofi_intercept_handler(start, length);
 
-	return real_calls.munmap(start, length);
+	return ofi_memhooks_syscall(SYS_munmap, start, length);
 }
 
 static void *ofi_intercept_mremap(void *old_address, size_t old_size,
 		size_t new_size, int flags, void *new_address)
 {
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepted mremap old_addr %p old_size %zu\n",
-	       old_address, old_size);
 	ofi_intercept_handler(old_address, old_size);
 
-	return real_calls.mremap(old_address, old_size, new_size, flags,
-				 new_address);
+#ifdef MREMAP_FIXED
+	/*
+	 * new_address is an optional argument. Explicitly set it to NULL
+	 * if it is not applicable.
+	 */
+	if (!(flags & MREMAP_FIXED))
+		new_address = NULL;
+#endif
+
+	return (void *)(intptr_t) ofi_memhooks_syscall(SYS_mremap, old_address,
+						       old_size, new_size,
+						       flags, new_address);
 }
 
 static int ofi_intercept_madvise(void *addr, size_t length, int advice)
 {
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepted madvise addr %p len %zu\n", addr, length);
-	ofi_intercept_handler(addr, length);
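+	/* Only advice that may discard page contents requires
+	 * invalidating cached registrations; other hints leave the
+	 * mapping contents intact.
+	 */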
+	if (advice == MADV_DONTNEED ||
+#ifdef MADV_FREE
+	    advice == MADV_FREE ||
+#endif
+#ifdef MADV_REMOVE
+	    advice == MADV_REMOVE ||
+#endif
+	    advice == POSIX_MADV_DONTNEED) {
+		ofi_intercept_handler(addr, length);
+	}
 
-	return real_calls.madvise(addr, length, advice);
+	return ofi_memhooks_syscall(SYS_madvise, addr, length, advice);
 }
 
 static void *ofi_intercept_shmat(int shmid, const void *shmaddr, int shmflg)
 {
 	struct shmid_ds ds;
 	const void *start;
+	void *result;
 	size_t len;
 	int ret;
 
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepted shmat addr %p\n", shmaddr);
-
-	if (shmflg & SHM_REMAP) {
+	if (shmaddr && (shmflg & SHM_REMAP)) {
 		ret = shmctl(shmid, IPC_STAT, &ds);
 		len = (ret < 0) ? 0 : ds.shm_segsz;
 
 		if (shmflg & SHM_RND) {
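+			/* SHM_RND rounds the attach address down to the
+			 * nearest SHMLBA boundary, so invalidate starting
+			 * from that rounded address.
+			 */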
-			start = (char *) shmaddr + ((uintptr_t) shmaddr) % SHMLBA;
+			start = (char *) shmaddr - ((uintptr_t) shmaddr) % SHMLBA;
 			len += ((uintptr_t) shmaddr) % SHMLBA;
 		} else {
 			start = shmaddr;
@@ -425,50 +453,92 @@ static void *ofi_intercept_shmat(int shmid, const void *shmaddr, int shmflg)
 		ofi_intercept_handler(start, len);
 	}
 
-	return real_calls.shmat(shmid, shmaddr, shmflg);
+#ifdef SYS_shmat
+	result = (void *) ofi_memhooks_syscall(SYS_shmat, shmid, shmaddr, shmflg);
+#else // IPCOP_shmat
+	unsigned long sysret;
+	sysret = ofi_memhooks_syscall(SYS_ipc, IPCOP_shmat,
+				      shmid, shmflg, &shmaddr, shmaddr);
+	result = (sysret > -(unsigned long)SHMLBA) ? (void *)sysret :
+						     (void *)shmaddr;
+#endif
+	return result;
 }
 
 static int ofi_intercept_shmdt(const void *shmaddr)
 {
-	FI_DBG(&core_prov, FI_LOG_MR,
-	       "intercepted shmdt addr %p\n", shmaddr);
-	/* Overly aggressive, but simple.  Invalidate everything after shmaddr */
+	int ret;
+
+	/*
+	 * Overly aggressive, but simple.  Invalidate everything after shmaddr.
+	 * We could choose to find the shared memory segment size in /proc but
+	 * that seems like a great way to deadlock ourselves.
+	 */
 	ofi_intercept_handler(shmaddr, SIZE_MAX - (uintptr_t) shmaddr);
 
-	return real_calls.shmdt(shmaddr);
+#ifdef SYS_shmdt
+	ret = ofi_memhooks_syscall(SYS_shmdt, shmaddr);
+#else // IPCOP_shmdt
+	ret = ofi_memhooks_syscall(SYS_ipc, IPCOP_shmdt, 0, 0, 0, shmaddr);
+#endif
+	return ret;
 }
 
 static int ofi_intercept_brk(const void *brkaddr)
 {
-	void *old_addr;
-
-	FI_DBG(&core_prov, FI_LOG_MR,
-	      "intercepted brk addr %p\n", brkaddr);
+	void *old_addr, *new_addr;
 
-	old_addr = sbrk (0);
+#ifdef HAVE___CURBRK
+	old_addr = __curbrk;
+#else
+	old_addr = sbrk(0);
+#endif
+	new_addr = (void *) (intptr_t) ofi_memhooks_syscall(SYS_brk, brkaddr);
+
+#ifdef HAVE___CURBRK
+	/*
+	 * Note: if we were using glibc brk/sbrk, their __curbrk would get
+	 * updated, but since we're going straight to the syscall, we have
+	 * to update __curbrk or else glibc won't see it.
+	 */
+	__curbrk = new_addr;
+#endif
 
-	if(brkaddr > old_addr) {
-		ofi_intercept_handler(brkaddr, (intptr_t) brkaddr -
-							  (intptr_t) old_addr);
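+	/* SYS_brk returns the resulting program break; a result below the
+	 * requested address means the kernel rejected the change.  If the
+	 * break moved down, invalidate the released range.
+	 */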
+	if (new_addr < brkaddr) {
+		errno = ENOMEM;
+		return -1;
+	} else if (new_addr < old_addr) {
+		ofi_intercept_handler(new_addr, (intptr_t) old_addr -
+				      (intptr_t) new_addr);
 	}
 
-	return real_calls.brk(brkaddr);
+	return 0;
 }
 
 static int ofi_memhooks_subscribe(struct ofi_mem_monitor *monitor,
-				 const void *addr, size_t len)
+				  const void *addr, size_t len,
+				  union ofi_mr_hmem_info *hmem_info)
 {
 	/* no-op */
 	return FI_SUCCESS;
 }
 
 static void ofi_memhooks_unsubscribe(struct ofi_mem_monitor *monitor,
-				    const void *addr, size_t len)
+				     const void *addr, size_t len,
+				     union ofi_mr_hmem_info *hmem_info)
+{
+	/* no-op */
+}
+
+static bool ofi_memhooks_valid(struct ofi_mem_monitor *monitor,
+			       const void *addr, size_t len,
+			       union ofi_mr_hmem_info *hmem_info)
 {
 	/* no-op */
+	return true;
 }
 
-int ofi_memhooks_init(void)
+static int ofi_memhooks_start(struct ofi_mem_monitor *monitor)
 {
 	int i, ret;
 
@@ -477,69 +547,55 @@ int ofi_memhooks_init(void)
 
 	memhooks_monitor->subscribe = ofi_memhooks_subscribe;
 	memhooks_monitor->unsubscribe = ofi_memhooks_unsubscribe;
+	memhooks_monitor->valid = ofi_memhooks_valid;
 	dlist_init(&memhooks.intercept_list);
 
 	for (i = 0; i < OFI_INTERCEPT_MAX; ++i)
 		dlist_init(&intercepts[i].dl_intercept_list);
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_DLOPEN],
-				   (void **) &real_calls.dlopen);
-	if (ret) {
-		FI_WARN(&core_prov, FI_LOG_MR,
-		       "intercept dlopen failed %d %s\n", ret, fi_strerror(ret));
-		return ret;
-	}
-
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MMAP],
-				   (void **) &real_calls.mmap);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MMAP]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept mmap failed %d %s\n", ret, fi_strerror(ret));
 		return ret;
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MUNMAP],
-				   (void **) &real_calls.munmap);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MUNMAP]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept munmap failed %d %s\n", ret, fi_strerror(ret));
 		return ret;
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MREMAP],
-				   (void **) &real_calls.mremap);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MREMAP]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept mremap failed %d %s\n", ret, fi_strerror(ret));
 		return ret;
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MADVISE],
-				   (void **) &real_calls.madvise);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MADVISE]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept madvise failed %d %s\n", ret, fi_strerror(ret));
 		return ret;
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMAT],
-				   (void **) &real_calls.shmat);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMAT]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept shmat failed %d %s\n", ret, fi_strerror(ret));
 		return ret;
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMDT],
-				   (void **) &real_calls.shmdt);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMDT]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept shmdt failed %d %s\n", ret, fi_strerror(ret));
 		return ret;
 	}
 
-	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_BRK],
-				   (void **) &real_calls.brk);
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_BRK]);
 	if (ret) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 		       "intercept brk failed %d %s\n", ret, fi_strerror(ret));
@@ -549,7 +605,7 @@ int ofi_memhooks_init(void)
 	return 0;
 }
 
-void ofi_memhooks_cleanup(void)
+static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor)
 {
 	ofi_restore_intercepts();
 	memhooks_monitor->subscribe = NULL;
@@ -558,13 +614,13 @@ void ofi_memhooks_cleanup(void)
 
 #else
 
-int ofi_memhooks_init(void)
+static int ofi_memhooks_start(struct ofi_mem_monitor *monitor)
 {
 	return -FI_ENOSYS;
 }
 
-void ofi_memhooks_cleanup(void)
+static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor)
 {
 }
 
-#endif
+#endif /* memhook support checks */
diff --git a/deps/libfabric/prov/util/src/util_mem_monitor.c b/deps/libfabric/prov/util/src/util_mem_monitor.c
index b9f0ac2608e24432966c028bee60f3d1304a1bac..8acafe5722cc168a5e8cda9551587cd95cb7699d 100644
--- a/deps/libfabric/prov/util/src/util_mem_monitor.c
+++ b/deps/libfabric/prov/util/src/util_mem_monitor.c
@@ -1,7 +1,9 @@
 /*
  * Copyright (c) 2017 Cray Inc. All rights reserved.
  * Copyright (c) 2017-2019 Intel Inc. All rights reserved.
- * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates.
+ *                         All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,11 +37,24 @@
 #include <ofi_mr.h>
 #include <unistd.h>
 
-static struct ofi_uffd uffd;
+pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_rwlock_t mm_list_rwlock = PTHREAD_RWLOCK_INITIALIZER;
+
+static int ofi_uffd_start(struct ofi_mem_monitor *monitor);
+static void ofi_uffd_stop(struct ofi_mem_monitor *monitor);
+
+static struct ofi_uffd uffd = {
+	.monitor.iface = FI_HMEM_SYSTEM,
+	.monitor.init = ofi_monitor_init,
+	.monitor.cleanup = ofi_monitor_cleanup,
+	.monitor.start = ofi_uffd_start,
+	.monitor.stop = ofi_uffd_stop,
+};
 struct ofi_mem_monitor *uffd_monitor = &uffd.monitor;
 
 struct ofi_mem_monitor *default_monitor;
-
+struct ofi_mem_monitor *default_cuda_monitor;
+struct ofi_mem_monitor *default_rocr_monitor;
 
 static size_t ofi_default_cache_size(void)
 {
@@ -57,20 +72,30 @@ static size_t ofi_default_cache_size(void)
 	return cache_size;
 }
 
+
+void ofi_monitor_init(struct ofi_mem_monitor *monitor)
+{
+	dlist_init(&monitor->list);
+}
+
+void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor)
+{
+	assert(dlist_empty(&monitor->list));
+}
+
 /*
  * Initialize all available memory monitors
  */
-void ofi_monitor_init(void)
+void ofi_monitors_init(void)
 {
-	pthread_mutex_init(&uffd_monitor->lock, NULL);
-	dlist_init(&uffd_monitor->list);
-
-	pthread_mutex_init(&memhooks_monitor->lock, NULL);
-	dlist_init(&memhooks_monitor->list);
+	uffd_monitor->init(uffd_monitor);
+	memhooks_monitor->init(memhooks_monitor);
+	cuda_monitor->init(cuda_monitor);
+	rocr_monitor->init(rocr_monitor);
 
-#if defined(HAVE_ELF_H) && defined(HAVE_SYS_AUXV_H)
+#if HAVE_MEMHOOKS_MONITOR
         default_monitor = memhooks_monitor;
-#elif HAVE_UFFD_UNMAP
+#elif HAVE_UFFD_MONITOR
         default_monitor = uffd_monitor;
 #else
         default_monitor = NULL;
@@ -97,100 +122,207 @@ void ofi_monitor_init(void)
 			" and free calls.  Userfaultfd is the default if"
 			" available on the system. 'disabled' option disables"
 			" memory caching.");
+	fi_param_define(NULL, "mr_cuda_cache_monitor_enabled", FI_PARAM_BOOL,
+			"Enable or disable the CUDA cache memory monitor. "
+			"Monitor is enabled by default.");
+	fi_param_define(NULL, "mr_rocr_cache_monitor_enabled", FI_PARAM_BOOL,
+			"Enable or disable the ROCR cache memory monitor. "
+			"Monitor is enabled by default.");
 
 	fi_param_get_size_t(NULL, "mr_cache_max_size", &cache_params.max_size);
 	fi_param_get_size_t(NULL, "mr_cache_max_count", &cache_params.max_cnt);
 	fi_param_get_str(NULL, "mr_cache_monitor", &cache_params.monitor);
+	fi_param_get_bool(NULL, "mr_cuda_cache_monitor_enabled",
+			  &cache_params.cuda_monitor_enabled);
+	fi_param_get_bool(NULL, "mr_rocr_cache_monitor_enabled",
+			  &cache_params.rocr_monitor_enabled);
 
 	if (!cache_params.max_size)
 		cache_params.max_size = ofi_default_cache_size();
 
 	if (cache_params.monitor != NULL) {
-		if (!strcmp(cache_params.monitor, "userfaultfd") &&
-		    default_monitor == uffd_monitor)
+		if (!strcmp(cache_params.monitor, "userfaultfd")) {
+#if HAVE_UFFD_MONITOR
 			default_monitor = uffd_monitor;
-		else if (!strcmp(cache_params.monitor, "memhooks"))
+#else
+			FI_WARN(&core_prov, FI_LOG_MR, "userfaultfd monitor not available\n");
+			default_monitor = NULL;
+#endif
+		} else if (!strcmp(cache_params.monitor, "memhooks")) {
+#if HAVE_MEMHOOKS_MONITOR
 			default_monitor = memhooks_monitor;
-		else if (!strcmp(cache_params.monitor, "disabled"))
+#else
+			FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n");
+			default_monitor = NULL;
+#endif
+		} else if (!strcmp(cache_params.monitor, "disabled")) {
 			default_monitor = NULL;
+		}
 	}
+
+	if (cache_params.cuda_monitor_enabled)
+		default_cuda_monitor = cuda_monitor;
+	else
+		default_cuda_monitor = NULL;
+
+	if (cache_params.rocr_monitor_enabled)
+		default_rocr_monitor = rocr_monitor;
+	else
+		default_rocr_monitor = NULL;
 }
 
-void ofi_monitor_cleanup(void)
+void ofi_monitors_cleanup(void)
 {
-	assert(dlist_empty(&uffd_monitor->list));
-	pthread_mutex_destroy(&uffd_monitor->lock);
-
-	assert(dlist_empty(&memhooks_monitor->list));
-	pthread_mutex_destroy(&memhooks_monitor->lock);
+	uffd_monitor->cleanup(uffd_monitor);
+	memhooks_monitor->cleanup(memhooks_monitor);
+	cuda_monitor->cleanup(cuda_monitor);
+	rocr_monitor->cleanup(rocr_monitor);
 }
 
-int ofi_monitor_add_cache(struct ofi_mem_monitor *monitor,
-			  struct ofi_mr_cache *cache)
+/* Monitors array must be of size OFI_HMEM_MAX. */
+int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors,
+			   struct ofi_mr_cache *cache)
 {
 	int ret = 0;
+	enum fi_hmem_iface iface;
+	struct ofi_mem_monitor *monitor;
+	unsigned int success_count = 0;
 
-	if (!monitor)
+	if (!monitors) {
+		for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; iface++)
+			cache->monitors[iface] = NULL;
 		return -FI_ENOSYS;
+	}
+
+	/* Loops until there are no readers or writers holding the lock */
+	do {
+		ret = pthread_rwlock_trywrlock(&mm_list_rwlock);
+		if (ret && ret != EBUSY) {
+			FI_WARN(&core_prov, FI_LOG_MR,
+				"add_cache cannot obtain write lock, %d\n",
+				ret);
+			return ret;
+		}
+	} while (ret);
+
+	for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; iface++) {
+		cache->monitors[iface] = NULL;
+
+		monitor = monitors[iface];
+		if (!monitor) {
+			FI_DBG(&core_prov, FI_LOG_MR,
+			       "MR cache disabled for %s memory\n",
+			       fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
+			continue;
+		}
+
+		if (dlist_empty(&monitor->list)) {
+			ret = monitor->start(monitor);
+			if (ret == -FI_ENOSYS)
+				continue;
+			else if (ret)
+				goto err;
+		}
 
-	pthread_mutex_lock(&monitor->lock);
-	if (dlist_empty(&monitor->list)) {
-		if (monitor == uffd_monitor)
-			ret = ofi_uffd_init();
-		else if (monitor == memhooks_monitor)
-			ret = ofi_memhooks_init();
-		else
-			ret = -FI_ENOSYS;
-
-		if (ret)
-			goto out;
+		success_count++;
+		cache->monitors[iface] = monitor;
+		dlist_insert_tail(&cache->notify_entries[iface],
+				  &monitor->list);
 	}
-	cache->monitor = monitor;
-	dlist_insert_tail(&cache->notify_entry, &monitor->list);
-out:
-	pthread_mutex_unlock(&monitor->lock);
+	pthread_rwlock_unlock(&mm_list_rwlock);
+	return success_count ? FI_SUCCESS : -FI_ENOSYS;
+
+err:
+	pthread_rwlock_unlock(&mm_list_rwlock);
+
+	FI_WARN(&core_prov, FI_LOG_MR,
+		"Failed to start %s memory monitor: %s\n",
+		fi_tostr(&iface, FI_TYPE_HMEM_IFACE), fi_strerror(-ret));
+	ofi_monitors_del_cache(cache);
+
 	return ret;
 }
 
-void ofi_monitor_del_cache(struct ofi_mr_cache *cache)
+void ofi_monitors_del_cache(struct ofi_mr_cache *cache)
 {
-	struct ofi_mem_monitor *monitor = cache->monitor;
+	struct ofi_mem_monitor *monitor;
+	enum fi_hmem_iface iface;
+	int ret;
+
+	/* Loops until there are no readers or writers holding the lock */
+	do {
+		ret = pthread_rwlock_trywrlock(&mm_list_rwlock);
+		if (ret && ret != EBUSY) {
+			FI_WARN(&core_prov, FI_LOG_MR,
+				"del_cache cannot obtain write lock, %d\n",
+				ret);
+			return;
+		}
+	} while (ret);
+
+	for (iface = 0; iface < OFI_HMEM_MAX; iface++) {
+		monitor = cache->monitors[iface];
+		if (!monitor)
+			continue;
+
+		dlist_remove(&cache->notify_entries[iface]);
 
-	assert(monitor);
-	pthread_mutex_lock(&monitor->lock);
-	dlist_remove(&cache->notify_entry);
+		if (dlist_empty(&monitor->list))
+			monitor->stop(monitor);
 
-	if (dlist_empty(&monitor->list)) {
-		if (monitor == uffd_monitor)
-			ofi_uffd_cleanup();
-		else if (monitor == memhooks_monitor)
-			ofi_memhooks_cleanup();
+		cache->monitors[iface] = NULL;
 	}
 
-	pthread_mutex_unlock(&monitor->lock);
+	pthread_rwlock_unlock(&mm_list_rwlock);
 }
 
-/* Must be called holding monitor lock */
+/* Must be called with the locks held, in the following order:
+ *	pthread_rwlock_rdlock(&mm_list_rwlock);
+ *	pthread_mutex_lock(&mm_lock);
+ *	ofi_monitor_notify();
+ *	pthread_mutex_unlock(&mm_lock);
+ *	pthread_rwlock_unlock(&mm_list_rwlock);
+ */
 void ofi_monitor_notify(struct ofi_mem_monitor *monitor,
 			const void *addr, size_t len)
 {
 	struct ofi_mr_cache *cache;
 
 	dlist_foreach_container(&monitor->list, struct ofi_mr_cache,
-				cache, notify_entry) {
+				cache, notify_entries[monitor->iface]) {
 		ofi_mr_cache_notify(cache, addr, len);
 	}
 }
 
+/* Must be called with the locks held, in the following order:
+ *	pthread_rwlock_rdlock(&mm_list_rwlock);
+ *	pthread_mutex_lock(&mm_lock);
+ *	ofi_monitor_flush();
+ *	pthread_mutex_unlock(&mm_lock);
+ *	pthread_rwlock_unlock(&mm_list_rwlock);
+ */
+void ofi_monitor_flush(struct ofi_mem_monitor *monitor)
+{
+	struct ofi_mr_cache *cache;
+
+	dlist_foreach_container(&monitor->list, struct ofi_mr_cache,
+				cache, notify_entries[monitor->iface]) {
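+		/* Drop mm_lock across the flush; freeing an entry may call
+		 * back into the monitor (e.g. to unsubscribe), which could
+		 * otherwise deadlock on this lock.
+		 */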
+		pthread_mutex_unlock(&mm_lock);
+		ofi_mr_cache_flush(cache, false);
+		pthread_mutex_lock(&mm_lock);
+	}
+}
+
 int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor,
-			  const void *addr, size_t len)
+			  const void *addr, size_t len,
+			  union ofi_mr_hmem_info *hmem_info)
 {
 	int ret;
 
 	FI_DBG(&core_prov, FI_LOG_MR,
 	       "subscribing addr=%p len=%zu\n", addr, len);
 
-	ret = monitor->subscribe(monitor, addr, len);
+	ret = monitor->subscribe(monitor, addr, len, hmem_info);
 	if (OFI_UNLIKELY(ret)) {
 		FI_WARN(&core_prov, FI_LOG_MR,
 			"Failed (ret = %d) to monitor addr=%p len=%zu\n",
@@ -200,14 +332,15 @@ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor,
 }
 
 void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor,
-			     const void *addr, size_t len)
+			     const void *addr, size_t len,
+			     union ofi_mr_hmem_info *hmem_info)
 {
 	FI_DBG(&core_prov, FI_LOG_MR,
 	       "unsubscribing addr=%p len=%zu\n", addr, len);
-	monitor->unsubscribe(monitor, addr, len);
+	monitor->unsubscribe(monitor, addr, len, hmem_info);
 }
 
-#if HAVE_UFFD_UNMAP
+#if HAVE_UFFD_MONITOR
 
 #include <poll.h>
 #include <sys/syscall.h>
@@ -228,10 +361,12 @@ static void *ofi_uffd_handler(void *arg)
 		if (ret != 1)
 			break;
 
-		pthread_mutex_lock(&uffd.monitor.lock);
+		pthread_rwlock_rdlock(&mm_list_rwlock);
+		pthread_mutex_lock(&mm_lock);
 		ret = read(uffd.fd, &msg, sizeof(msg));
 		if (ret != sizeof(msg)) {
-			pthread_mutex_unlock(&uffd.monitor.lock);
+			pthread_mutex_unlock(&mm_lock);
+			pthread_rwlock_unlock(&mm_list_rwlock);
 			if (errno != EAGAIN)
 				break;
 			continue;
@@ -242,7 +377,7 @@ static void *ofi_uffd_handler(void *arg)
 			ofi_monitor_unsubscribe(&uffd.monitor,
 				(void *) (uintptr_t) msg.arg.remove.start,
 				(size_t) (msg.arg.remove.end -
-					  msg.arg.remove.start));
+					  msg.arg.remove.start), NULL);
 			/* fall through */
 		case UFFD_EVENT_UNMAP:
 			ofi_monitor_notify(&uffd.monitor,
@@ -260,7 +395,8 @@ static void *ofi_uffd_handler(void *arg)
 				"Unhandled uffd event %d\n", msg.event);
 			break;
 		}
-		pthread_mutex_unlock(&uffd.monitor.lock);
+		pthread_mutex_unlock(&mm_lock);
+		pthread_rwlock_unlock(&mm_list_rwlock);
 	}
 	return NULL;
 }
@@ -286,7 +422,8 @@ static int ofi_uffd_register(const void *addr, size_t len, size_t page_size)
 }
 
 static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor,
-			      const void *addr, size_t len)
+			      const void *addr, size_t len,
+			      union ofi_mr_hmem_info *hmem_info)
 {
 	int i;
 
@@ -319,7 +456,8 @@ static int ofi_uffd_unregister(const void *addr, size_t len, size_t page_size)
 
 /* May be called from mr cache notifier callback */
 static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor,
-				 const void *addr, size_t len)
+				 const void *addr, size_t len,
+				 union ofi_mr_hmem_info *hmem_info)
 {
 	int i;
 
@@ -330,13 +468,21 @@ static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor,
 	}
 }
 
-int ofi_uffd_init(void)
+static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, const void *addr,
+			   size_t len, union ofi_mr_hmem_info *hmem_info)
+{
+	/* no-op */
+	return true;
+}
+
+static int ofi_uffd_start(struct ofi_mem_monitor *monitor)
 {
 	struct uffdio_api api;
 	int ret;
 
 	uffd.monitor.subscribe = ofi_uffd_subscribe;
 	uffd.monitor.unsubscribe = ofi_uffd_unsubscribe;
+	uffd.monitor.valid = ofi_uffd_valid;
 
 	if (!num_page_sizes)
 		return -FI_ENODATA;
@@ -379,22 +525,22 @@ closefd:
 	return ret;
 }
 
-void ofi_uffd_cleanup(void)
+static void ofi_uffd_stop(struct ofi_mem_monitor *monitor)
 {
 	pthread_cancel(uffd.thread);
 	pthread_join(uffd.thread, NULL);
 	close(uffd.fd);
 }
 
-#else /* HAVE_UFFD_UNMAP */
+#else /* HAVE_UFFD_MONITOR */
 
-int ofi_uffd_init(void)
+static int ofi_uffd_start(struct ofi_mem_monitor *monitor)
 {
 	return -FI_ENOSYS;
 }
 
-void ofi_uffd_cleanup(void)
+static void ofi_uffd_stop(struct ofi_mem_monitor *monitor)
 {
 }
 
-#endif /* HAVE_UFFD_UNMAP */
+#endif /* HAVE_UFFD_MONITOR */
diff --git a/deps/libfabric/prov/util/src/util_mr_cache.c b/deps/libfabric/prov/util/src/util_mr_cache.c
index c886563b8ccac1957e9902720e353be0f4429ea9..eca00802762307364210dd35ebf002924fc7d841 100644
--- a/deps/libfabric/prov/util/src/util_mr_cache.c
+++ b/deps/libfabric/prov/util/src/util_mr_cache.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2017-2019 Intel Corporation, Inc.  All rights reserved.
  * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
  * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -44,6 +45,8 @@
 
 struct ofi_mr_cache_params cache_params = {
 	.max_cnt = 1024,
+	.cuda_monitor_enabled = true,
+	.rocr_monitor_enabled = true,
 };
 
 static int util_mr_find_within(struct ofi_rbmap *map, void *key, void *data)
@@ -150,24 +153,24 @@ void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t le
 		util_mr_uncache_entry(cache, entry);
 }
 
-bool ofi_mr_cache_flush(struct ofi_mr_cache *cache)
+bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru)
 {
 	struct ofi_mr_entry *entry;
 
-	pthread_mutex_lock(&cache->monitor->lock);
+	pthread_mutex_lock(&mm_lock);
 	while (!dlist_empty(&cache->flush_list)) {
 		dlist_pop_front(&cache->flush_list, struct ofi_mr_entry,
 				entry, list_entry);
 		FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %zu)\n",
 		       entry->info.iov.iov_base, entry->info.iov.iov_len);
-		pthread_mutex_unlock(&cache->monitor->lock);
+		pthread_mutex_unlock(&mm_lock);
 
 		util_mr_free_entry(cache, entry);
-		pthread_mutex_lock(&cache->monitor->lock);
+		pthread_mutex_lock(&mm_lock);
 	}
 
-	if (dlist_empty(&cache->lru_list)) {
-		pthread_mutex_unlock(&cache->monitor->lock);
+	if (!flush_lru || dlist_empty(&cache->lru_list)) {
+		pthread_mutex_unlock(&mm_lock);
 		return false;
 	}
 
@@ -179,15 +182,15 @@ bool ofi_mr_cache_flush(struct ofi_mr_cache *cache)
 		       entry->info.iov.iov_base, entry->info.iov.iov_len);
 
 		util_mr_uncache_entry_storage(cache, entry);
-		pthread_mutex_unlock(&cache->monitor->lock);
+		pthread_mutex_unlock(&mm_lock);
 
 		util_mr_free_entry(cache, entry);
-		pthread_mutex_lock(&cache->monitor->lock);
+		pthread_mutex_lock(&mm_lock);
 
 	} while (!dlist_empty(&cache->lru_list) &&
 		 ((cache->cached_cnt >= cache_params.max_cnt) ||
 		  (cache->cached_size >= cache_params.max_size)));
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 
 	return true;
 }
@@ -197,20 +200,20 @@ void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry)
 	FI_DBG(cache->domain->prov, FI_LOG_MR, "delete %p (len: %zu)\n",
 	       entry->info.iov.iov_base, entry->info.iov.iov_len);
 
-	pthread_mutex_lock(&cache->monitor->lock);
+	pthread_mutex_lock(&mm_lock);
 	cache->delete_cnt++;
 
 	if (--entry->use_cnt == 0) {
 		if (!entry->storage_context) {
 			cache->uncached_cnt--;
 			cache->uncached_size -= entry->info.iov.iov_len;
-			pthread_mutex_unlock(&cache->monitor->lock);
+			pthread_mutex_unlock(&mm_lock);
 			util_mr_free_entry(cache, entry);
 			return;
 		}
 		dlist_insert_tail(&entry->list_entry, &cache->lru_list);
 	}
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 }
 
 /*
@@ -229,6 +232,9 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info,
 {
 	struct ofi_mr_entry *cur;
 	int ret;
+	struct ofi_mem_monitor *monitor = cache->monitors[info->iface];
+
+	assert(monitor);
 
 	FI_DBG(cache->domain->prov, FI_LOG_MR, "create %p (len: %zu)\n",
 	       info->iov.iov_base, info->iov.iov_len);
@@ -245,7 +251,7 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info,
 	if (ret)
 		goto free;
 
-	pthread_mutex_lock(&cache->monitor->lock);
+	pthread_mutex_lock(&mm_lock);
 	cur = cache->storage.find(&cache->storage, info);
 	if (cur) {
 		ret = -FI_EAGAIN;
@@ -265,21 +271,20 @@ util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info,
 		cache->cached_cnt++;
 		cache->cached_size += info->iov.iov_len;
 
-		ret = ofi_monitor_subscribe(cache->monitor, info->iov.iov_base,
-					    info->iov.iov_len);
+		ret = ofi_monitor_subscribe(monitor, info->iov.iov_base,
+					    info->iov.iov_len,
+					    &(*entry)->hmem_info);
 		if (ret) {
 			util_mr_uncache_entry_storage(cache, *entry);
 			cache->uncached_cnt++;
 			cache->uncached_size += (*entry)->info.iov.iov_len;
-		} else {
-			(*entry)->subscribed = 1;
 		}
 	}
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 	return 0;
 
 unlock:
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 free:
 	util_mr_free_entry(cache, *entry);
 	return ret;
@@ -290,40 +295,54 @@ int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *att
 {
 	struct ofi_mr_info info;
 	int ret;
+	struct ofi_mem_monitor *monitor = cache->monitors[attr->iface];
+
+	if (!monitor) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"MR cache disabled for %s memory\n",
+			fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE));
+		return -FI_ENOSYS;
+	}
 
 	assert(attr->iov_count == 1);
 	FI_DBG(cache->domain->prov, FI_LOG_MR, "search %p (len: %zu)\n",
 	       attr->mr_iov->iov_base, attr->mr_iov->iov_len);
 
 	info.iov = *attr->mr_iov;
+	info.iface = attr->iface;
+	info.device = attr->device.reserved;
 
 	do {
-		pthread_mutex_lock(&cache->monitor->lock);
+		pthread_mutex_lock(&mm_lock);
 
 		if ((cache->cached_cnt >= cache_params.max_cnt) ||
 		    (cache->cached_size >= cache_params.max_size)) {
-			pthread_mutex_unlock(&cache->monitor->lock);
-			ofi_mr_cache_flush(cache);
-			pthread_mutex_lock(&cache->monitor->lock);
+			pthread_mutex_unlock(&mm_lock);
+			ofi_mr_cache_flush(cache, true);
+			pthread_mutex_lock(&mm_lock);
 		}
 
 		cache->search_cnt++;
 		*entry = cache->storage.find(&cache->storage, &info);
-		if (*entry && ofi_iov_within(attr->mr_iov, &(*entry)->info.iov))
+
+		if (*entry &&
+		    ofi_iov_within(attr->mr_iov, &(*entry)->info.iov) &&
+		    monitor->valid(monitor,
+				   (const void *)(*entry)->info.iov.iov_base,
+				   (*entry)->info.iov.iov_len,
+				   &(*entry)->hmem_info))
 			goto hit;
 
 		/* Purge regions that overlap with new region */
 		while (*entry) {
-			/* New entry will expand range of subscription */
-			(*entry)->subscribed = 0;
 			util_mr_uncache_entry(cache, *entry);
 			*entry = cache->storage.find(&cache->storage, &info);
 		}
-		pthread_mutex_unlock(&cache->monitor->lock);
+		pthread_mutex_unlock(&mm_lock);
 
 		ret = util_mr_cache_create(cache, &info, entry);
 		if (ret && ret != -FI_EAGAIN) {
-			if (ofi_mr_cache_flush(cache))
+			if (ofi_mr_cache_flush(cache, true))
 				ret = -FI_EAGAIN;
 		}
 	} while (ret == -FI_EAGAIN);
@@ -334,7 +353,7 @@ hit:
 	cache->hit_cnt++;
 	if ((*entry)->use_cnt++ == 0)
 		dlist_remove_init(&(*entry)->list_entry);
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 	return 0;
 }
 
@@ -348,7 +367,7 @@ struct ofi_mr_entry *ofi_mr_cache_find(struct ofi_mr_cache *cache,
 	FI_DBG(cache->domain->prov, FI_LOG_MR, "find %p (len: %zu)\n",
 	       attr->mr_iov->iov_base, attr->mr_iov->iov_len);
 
-	pthread_mutex_lock(&cache->monitor->lock);
+	pthread_mutex_lock(&mm_lock);
 	cache->search_cnt++;
 
 	info.iov = *attr->mr_iov;
@@ -367,7 +386,7 @@ struct ofi_mr_entry *ofi_mr_cache_find(struct ofi_mr_cache *cache,
 		dlist_remove_init(&(entry)->list_entry);
 
 unlock:
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 	return entry;
 }
 
@@ -384,10 +403,10 @@ int ofi_mr_cache_reg(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr,
 	if (!*entry)
 		return -FI_ENOMEM;
 
-	pthread_mutex_lock(&cache->monitor->lock);
+	pthread_mutex_lock(&mm_lock);
 	cache->uncached_cnt++;
 	cache->uncached_size += attr->mr_iov->iov_len;
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 
 	(*entry)->info.iov = *attr->mr_iov;
 	(*entry)->use_cnt = 1;
@@ -401,10 +420,10 @@ int ofi_mr_cache_reg(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr,
 
 buf_free:
 	util_mr_entry_free(cache, *entry);
-	pthread_mutex_lock(&cache->monitor->lock);
+	pthread_mutex_lock(&mm_lock);
 	cache->uncached_cnt--;
 	cache->uncached_size -= attr->mr_iov->iov_len;
-	pthread_mutex_unlock(&cache->monitor->lock);
+	pthread_mutex_unlock(&mm_lock);
 	return ret;
 }
 
@@ -419,11 +438,11 @@ void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache)
 		cache->search_cnt, cache->delete_cnt, cache->hit_cnt,
 		cache->notify_cnt);
 
-	while (ofi_mr_cache_flush(cache))
+	while (ofi_mr_cache_flush(cache, true))
 		;
 
 	pthread_mutex_destroy(&cache->lock);
-	ofi_monitor_del_cache(cache);
+	ofi_monitors_del_cache(cache);
 	cache->storage.destroy(&cache->storage);
 	ofi_atomic_dec32(&cache->domain->ref);
 	ofi_bufpool_destroy(cache->entry_pool);
@@ -519,8 +538,9 @@ static int ofi_mr_cache_init_storage(struct ofi_mr_cache *cache)
 	return ret;
 }
 
+/* Monitors array must be of size OFI_HMEM_MAX. */
 int ofi_mr_cache_init(struct util_domain *domain,
-		      struct ofi_mem_monitor *monitor,
+		      struct ofi_mem_monitor **monitors,
 		      struct ofi_mr_cache *cache)
 {
 	int ret;
@@ -547,9 +567,9 @@ int ofi_mr_cache_init(struct util_domain *domain,
 	if (ret)
 		goto dec;
 
-	ret = ofi_monitor_add_cache(monitor, cache);
+	ret = ofi_monitors_add_cache(monitors, cache);
 	if (ret)
-		goto destroy;
+		goto del;
 
 	ret = ofi_bufpool_create(&cache->entry_pool,
 				 sizeof(struct ofi_mr_entry) +
@@ -560,8 +580,7 @@ int ofi_mr_cache_init(struct util_domain *domain,
 
 	return 0;
 del:
-	ofi_monitor_del_cache(cache);
-destroy:
+	ofi_monitors_del_cache(cache);
 	cache->storage.destroy(&cache->storage);
 dec:
 	ofi_atomic_dec32(&cache->domain->ref);
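
The flush path above uses a drop-the-lock-around-teardown pattern: entries are unlinked from the flush/LRU lists while holding the global monitor lock (`mm_lock`, which replaces the per-monitor lock), but each entry is freed with the lock released, since freeing may deregister memory and re-enter monitor code. A minimal sketch of the same pattern, with illustrative names rather than libfabric API:

```c
#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; void *payload; };

struct flush_list {
	pthread_mutex_t lock;	/* stands in for the global mm_lock */
	struct node *head;
};

static void flush_all(struct flush_list *fl, void (*release)(void *))
{
	pthread_mutex_lock(&fl->lock);
	while (fl->head) {
		struct node *n = fl->head;
		fl->head = n->next;

		/* Drop the lock before the release callback: it may take a
		 * slow path (e.g. deregistration) or need the lock itself. */
		pthread_mutex_unlock(&fl->lock);
		release(n->payload);
		free(n);
		pthread_mutex_lock(&fl->lock);
	}
	pthread_mutex_unlock(&fl->lock);
}
```

Re-acquiring the lock before re-checking the list head keeps the traversal safe against concurrent inserts.
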
diff --git a/deps/libfabric/prov/util/src/util_mr_map.c b/deps/libfabric/prov/util/src/util_mr_map.c
index 2157b702b42a4861ceaf641e7e7c4b7841f0b3c3..78e6459f5c8763f873852f969e3d626c52d2b8dd 100644
--- a/deps/libfabric/prov/util/src/util_mr_map.c
+++ b/deps/libfabric/prov/util/src/util_mr_map.c
@@ -219,10 +219,39 @@ static struct fi_ops ofi_mr_fi_ops = {
 	.ops_open = fi_no_ops_open
 };
 
+void ofi_mr_update_attr(uint32_t user_version, uint64_t caps,
+			const struct fi_mr_attr *user_attr,
+			struct fi_mr_attr *cur_abi_attr)
+{
+	cur_abi_attr->mr_iov = (struct iovec *) user_attr->mr_iov;
+	cur_abi_attr->iov_count = user_attr->iov_count;
+	cur_abi_attr->access = user_attr->access;
+	cur_abi_attr->offset = user_attr->offset;
+	cur_abi_attr->requested_key = user_attr->requested_key;
+	cur_abi_attr->context = user_attr->context;
+
+	if (FI_VERSION_GE(user_version, FI_VERSION(1, 5))) {
+		cur_abi_attr->auth_key_size = user_attr->auth_key_size;
+		cur_abi_attr->auth_key = user_attr->auth_key;
+	} else {
+		cur_abi_attr->auth_key_size = 0;
+		cur_abi_attr->auth_key = NULL;
+	}
+
+	if (caps & FI_HMEM) {
+		cur_abi_attr->iface = user_attr->iface;
+		cur_abi_attr->device = user_attr->device;
+	} else {
+		cur_abi_attr->iface = FI_HMEM_SYSTEM;
+		cur_abi_attr->device.reserved = 0;
+	}
+}
+
 int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 		   uint64_t flags, struct fid_mr **mr_fid)
 {
 	struct util_domain *domain;
+	struct fi_mr_attr cur_abi_attr;
 	struct ofi_mr *mr;
 	uint64_t key;
 	int ret = 0;
@@ -235,6 +264,8 @@ int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	if (!mr)
 		return -FI_ENOMEM;
 
+	ofi_mr_update_attr(domain->fabric->fabric_fid.api_version,
+			   domain->info_domain_caps, attr, &cur_abi_attr);
 	fastlock_acquire(&domain->lock);
 
 	mr->mr_fid.fid.fclass = FI_CLASS_MR;
@@ -242,15 +273,17 @@ int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 	mr->mr_fid.fid.ops = &ofi_mr_fi_ops;
 	mr->domain = domain;
 	mr->flags = flags;
+	mr->iface = cur_abi_attr.iface;
+	mr->device = cur_abi_attr.device.reserved;
 
-	ret = ofi_mr_map_insert(&domain->mr_map, attr, &key, mr);
+	ret = ofi_mr_map_insert(&domain->mr_map, &cur_abi_attr, &key, mr);
 	if (ret) {
 		free(mr);
 		goto out;
 	}
 
 	mr->mr_fid.key = mr->key = key;
-	mr->mr_fid.mem_desc = (void *) (uintptr_t) key;
+	mr->mr_fid.mem_desc = (void *) mr;
 
 	*mr_fid = &mr->mr_fid;
 	ofi_atomic_inc32(&domain->ref);
@@ -273,6 +306,9 @@ int ofi_mr_regv(struct fid *fid, const struct iovec *iov,
 	attr.offset = offset;
 	attr.requested_key = requested_key;
 	attr.context = context;
+	attr.iface = FI_HMEM_SYSTEM;
+	attr.device.reserved = 0;
+
 	return ofi_mr_regattr(fid, &attr, flags, mr_fid);
 }
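
ofi_mr_update_attr() above normalizes a caller-supplied fi_mr_attr into the current ABI layout: fields that the caller's API version or the domain's capabilities do not cover are forced to safe defaults rather than copied. A usage sketch under those assumptions (internal prototype as added by this patch; values illustrative):

```c
#include <assert.h>
#include <rdma/fi_domain.h>

/* Assumes the internal prototype added by this patch:
 * void ofi_mr_update_attr(uint32_t, uint64_t,
 *			   const struct fi_mr_attr *, struct fi_mr_attr *); */
static void check_pre_1_5_caller(void *buf, size_t len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct fi_mr_attr user = {
		.mr_iov = &iov,
		.iov_count = 1,
		.access = FI_READ | FI_WRITE,
	};
	struct fi_mr_attr cur;

	/* API 1.4 caller on a domain without FI_HMEM: auth_key and
	 * iface/device are zeroed rather than read from the user struct,
	 * so pre-1.5 callers cannot leak stack garbage into them. */
	ofi_mr_update_attr(FI_VERSION(1, 4), 0, &user, &cur);
	assert(cur.auth_key == NULL && cur.auth_key_size == 0);
	assert(cur.iface == FI_HMEM_SYSTEM && cur.device.reserved == 0);
}
```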
 
diff --git a/deps/libfabric/prov/util/src/util_shm.c b/deps/libfabric/prov/util/src/util_shm.c
index 727290745475cf53f10aa2c27e306c1194a3f00b..72301b62dbb607da7562c8bc3713823e1dcb060c 100644
--- a/deps/libfabric/prov/util/src/util_shm.c
+++ b/deps/libfabric/prov/util/src/util_shm.c
@@ -39,6 +39,7 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <stdio.h>
 
 #include <ofi_shm.h>
 
@@ -60,8 +61,8 @@ void smr_cleanup(void)
 
 static void smr_peer_addr_init(struct smr_addr *peer)
 {
-	memset(peer->name, 0, NAME_MAX);
-	peer->addr = FI_ADDR_UNSPEC;
+	memset(peer->name, 0, SMR_NAME_MAX);
+	peer->id = -1;
 }
 
 void smr_cma_check(struct smr_region *smr, struct smr_region *peer_smr)
@@ -120,9 +121,9 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 	if (name_offset)
 		*name_offset = ep_name_offset;
 
-	total_size = ep_name_offset + NAME_MAX;
+	total_size = ep_name_offset + SMR_NAME_MAX;
 
-	/* 
+	/*
  	 * Revisit later to see if we really need the size adjustment, or
  	 * at most align to a multiple of a page size.
  	 */
@@ -133,7 +134,7 @@ size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count,
 
 /* TODO: Determine if aligning SMR data helps performance */
 int smr_create(const struct fi_provider *prov, struct smr_map *map,
-	       const struct smr_attr *attr, struct smr_region **smr)
+	       const struct smr_attr *attr, struct smr_region *volatile *smr)
 {
 	struct smr_ep_name *ep_name;
 	size_t total_size, cmd_queue_offset, peer_data_offset;
@@ -153,16 +154,17 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	fd = shm_open(attr->name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
 	if (fd < 0) {
 		FI_WARN(prov, FI_LOG_EP_CTRL, "shm_open error\n");
-		goto err1;
+		return -errno;
 	}
 
 	ep_name = calloc(1, sizeof(*ep_name));
 	if (!ep_name) {
 		FI_WARN(prov, FI_LOG_EP_CTRL, "calloc error\n");
-		return -FI_ENOMEM;
+		ret = -FI_ENOMEM;
+		goto close;
 	}
-	strncpy(ep_name->name, (char *)attr->name, NAME_MAX - 1);
-	ep_name->name[NAME_MAX - 1] = '\0';
+	strncpy(ep_name->name, (char *)attr->name, SMR_NAME_MAX - 1);
+	ep_name->name[SMR_NAME_MAX - 1] = '\0';
 
 	pthread_mutex_lock(&ep_list_lock);
 	dlist_insert_tail(&ep_name->entry, &ep_name_list);
@@ -170,14 +172,16 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	ret = ftruncate(fd, total_size);
 	if (ret < 0) {
 		FI_WARN(prov, FI_LOG_EP_CTRL, "ftruncate error\n");
-		goto err2;
+		ret = -errno;
+		goto remove;
 	}
 
 	mapped_addr = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
 			   MAP_SHARED, fd, 0);
 	if (mapped_addr == MAP_FAILED) {
 		FI_WARN(prov, FI_LOG_EP_CTRL, "mmap error\n");
-		goto err2;
+		ret = -errno;
+		goto remove;
 	}
 
 	close(fd);
@@ -187,12 +191,10 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 
 	*smr = mapped_addr;
 	fastlock_init(&(*smr)->lock);
-	fastlock_acquire(&(*smr)->lock);
 
 	(*smr)->map = map;
 	(*smr)->version = SMR_VERSION;
 	(*smr)->flags = SMR_FLAG_ATOMIC | SMR_FLAG_DEBUG;
-	(*smr)->pid = getpid();
 	(*smr)->cma_cap = SMR_CMA_CAP_NA;
 	(*smr)->base_addr = *smr;
 
@@ -210,23 +212,27 @@ int smr_create(const struct fi_provider *prov, struct smr_map *map,
 	smr_cmd_queue_init(smr_cmd_queue(*smr), rx_size);
 	smr_resp_queue_init(smr_resp_queue(*smr), tx_size);
 	smr_inject_pool_init(smr_inject_pool(*smr), rx_size);
-	smr_sar_pool_init(smr_sar_pool(*smr), SMR_MAX_PEERS); 
+	smr_sar_pool_init(smr_sar_pool(*smr), SMR_MAX_PEERS);
 	for (i = 0; i < SMR_MAX_PEERS; i++) {
 		smr_peer_addr_init(&smr_peer_data(*smr)[i].addr);
 		smr_peer_data(*smr)[i].sar_status = 0;
+		smr_peer_data(*smr)[i].name_sent = 0;
 	}
 
 	strncpy((char *) smr_name(*smr), attr->name, total_size - name_offset);
-	fastlock_release(&(*smr)->lock);
 
+	/* Must be set last to signal full initialization to peers */
+	(*smr)->pid = getpid();
 	return 0;
 
-err2:
-	shm_unlink(attr->name);
-	close(fd);
+remove:
+	dlist_remove(&ep_name->entry);
 	pthread_mutex_unlock(&ep_list_lock);
-err1:
-	return -errno;
+	free(ep_name);
+close:
+	close(fd);
+	shm_unlink(attr->name);
+	return ret;
 }
 
 void smr_free(struct smr_region *smr)
@@ -235,6 +241,16 @@ void smr_free(struct smr_region *smr)
 	munmap(smr, smr->total_size);
 }
 
+static int smr_name_compare(struct ofi_rbmap *map, void *key, void *data)
+{
+	struct smr_map *smr_map;
+
+	smr_map = container_of(map, struct smr_map, rbmap);
+
+	return strncmp(smr_map->peers[(int64_t) data].peer.name,
+		       (char *) key, SMR_NAME_MAX);
+}
+
 int smr_map_create(const struct fi_provider *prov, int peer_count,
 		   struct smr_map **map)
 {
@@ -246,9 +262,12 @@ int smr_map_create(const struct fi_provider *prov, int peer_count,
 		return -FI_ENOMEM;
 	}
 
-	for (i = 0; i < peer_count; i++)
+	for (i = 0; i < peer_count; i++) {
 		smr_peer_addr_init(&(*map)->peers[i].peer);
+		(*map)->peers[i].fiaddr = FI_ADDR_UNSPEC;
+	}
 
+	ofi_rbmap_init(&(*map)->rbmap, smr_name_compare);
 	fastlock_init(&(*map)->lock);
 
 	return 0;
@@ -310,108 +329,126 @@ out:
 	return ret;
 }
 
-void smr_map_to_endpoint(struct smr_region *region, int index)
+void smr_map_to_endpoint(struct smr_region *region, int64_t id)
 {
 	struct smr_region *peer_smr;
-	struct smr_peer_data *local_peers, *peer_peers;
-	int peer_index;
+	struct smr_peer_data *local_peers;
+
+	if (region->map->peers[id].peer.id < 0)
+		return;
 
 	local_peers = smr_peer_data(region);
 
-	strncpy(smr_peer_data(region)[index].addr.name,
-		region->map->peers[index].peer.name, NAME_MAX - 1);
-	smr_peer_data(region)[index].addr.name[NAME_MAX - 1] = '\0';
-	if (region->map->peers[index].peer.addr == FI_ADDR_UNSPEC)
-		return;
+	strncpy(local_peers[id].addr.name,
+		region->map->peers[id].peer.name, SMR_NAME_MAX - 1);
+	local_peers[id].addr.name[SMR_NAME_MAX - 1] = '\0';
 
-	peer_smr = smr_peer_region(region, index);
-	peer_peers = smr_peer_data(peer_smr);
+	peer_smr = smr_peer_region(region, id);
 
-	if (region->cma_cap == SMR_CMA_CAP_NA)
+	if (region->cma_cap == SMR_CMA_CAP_NA && region != peer_smr)
 		smr_cma_check(region, peer_smr);
-
-	for (peer_index = 0; peer_index < SMR_MAX_PEERS; peer_index++) {
-		if (!strncmp(smr_name(region),
-		    peer_peers[peer_index].addr.name, NAME_MAX))
-			break;
-	}
-	if (peer_index != SMR_MAX_PEERS) {
-		peer_peers[peer_index].addr.addr = index;
-		local_peers[index].addr.addr = peer_index;
-	}
 }
 
-void smr_unmap_from_endpoint(struct smr_region *region, int index)
+void smr_unmap_from_endpoint(struct smr_region *region, int64_t id)
 {
 	struct smr_region *peer_smr;
 	struct smr_peer_data *local_peers, *peer_peers;
-	int peer_index;
+	int64_t peer_id;
 
 	local_peers = smr_peer_data(region);
 
-	memset(local_peers[index].addr.name, 0, NAME_MAX);
-	peer_index = region->map->peers[index].peer.addr;
-	if (peer_index == FI_ADDR_UNSPEC)
+	memset(local_peers[id].addr.name, 0, SMR_NAME_MAX);
+	peer_id = region->map->peers[id].peer.id;
+	if (peer_id < 0)
 		return;
 
-	peer_smr = smr_peer_region(region, index);
+	peer_smr = smr_peer_region(region, id);
 	peer_peers = smr_peer_data(peer_smr);
 
-	peer_peers[peer_index].addr.addr = FI_ADDR_UNSPEC;
+	peer_peers[peer_id].addr.id = -1;
+	peer_peers[peer_id].name_sent = 0;
 }
 
 void smr_exchange_all_peers(struct smr_region *region)
 {
-	int i;
+	int64_t i;
 	for (i = 0; i < SMR_MAX_PEERS; i++)
 		smr_map_to_endpoint(region, i);
 }
 
 int smr_map_add(const struct fi_provider *prov, struct smr_map *map,
-		const char *name, int id)
+		const char *name, int64_t *id)
 {
-	int ret = 0;
+	struct ofi_rbnode *node;
+	int tries = 0, ret = 0;
 
 	fastlock_acquire(&map->lock);
-	strncpy(map->peers[id].peer.name, name, NAME_MAX);
-	map->peers[id].peer.name[NAME_MAX - 1] = '\0';
-	ret = smr_map_to_region(prov, &map->peers[id]);
+	ret = ofi_rbmap_insert(&map->rbmap, (void *) name, (void *) *id, &node);
+	if (ret) {
+		assert(ret == -FI_EALREADY);
+		*id = (int64_t) node->data;
+		fastlock_release(&map->lock);
+		return 0;
+	}
+
+	while (map->peers[map->cur_id].peer.id != -1 &&
+	       tries < SMR_MAX_PEERS) {
+		if (++map->cur_id == SMR_MAX_PEERS)
+			map->cur_id = 0;
+		tries++;
+	}
+
+	assert(map->cur_id < SMR_MAX_PEERS && tries < SMR_MAX_PEERS);
+	*id = map->cur_id;
+	node->data = (void *) *id;
+	strncpy(map->peers[*id].peer.name, name, SMR_NAME_MAX);
+	map->peers[*id].peer.name[SMR_NAME_MAX - 1] = '\0';
+
+	ret = smr_map_to_region(prov, &map->peers[*id]);
 	if (!ret)
-		map->peers[id].peer.addr = id;
-	fastlock_release(&map->lock);
+		map->peers[*id].peer.id = *id;
 
+	fastlock_release(&map->lock);
 	return ret == -ENOENT ? 0 : ret;
 }
 
-void smr_map_del(struct smr_map *map, int id)
+void smr_map_del(struct smr_map *map, int64_t id)
 {
 	struct dlist_entry *entry;
 
-	if (id >= SMR_MAX_PEERS || id < 0 ||
-	    map->peers[id].peer.addr == FI_ADDR_UNSPEC)
+	if (id >= SMR_MAX_PEERS || id < 0 || map->peers[id].peer.id < 0)
 		return;
 
 	pthread_mutex_lock(&ep_list_lock);
 	entry = dlist_find_first_match(&ep_name_list, smr_match_name,
 				       map->peers[id].peer.name);
 	pthread_mutex_unlock(&ep_list_lock);
+
+	fastlock_acquire(&map->lock);
 	if (!entry)
 		munmap(map->peers[id].region, map->peers[id].region->total_size);
 
-	map->peers[id].peer.addr = FI_ADDR_UNSPEC;
+	(void) ofi_rbmap_find_delete(&map->rbmap,
+				     (void *) map->peers[id].peer.name);
+
+	map->peers[id].fiaddr = FI_ADDR_UNSPEC;
+	map->peers[id].peer.id = -1;
+
+	fastlock_release(&map->lock);
 }
 
 void smr_map_free(struct smr_map *map)
 {
-	int i;
+	int64_t i;
 
 	for (i = 0; i < SMR_MAX_PEERS; i++)
 		smr_map_del(map, i);
 
+	ofi_rbmap_cleanup(&map->rbmap);
 	free(map);
 }
 
-struct smr_region *smr_map_get(struct smr_map *map, int id)
+struct smr_region *smr_map_get(struct smr_map *map, int64_t id)
 {
 	if (id < 0 || id >= SMR_MAX_PEERS)
 		return NULL;
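
Two details in the util_shm.c changes deserve a note: peer identity moves from an fi_addr to a signed id (-1 meaning unassigned, hence the `peer.id < 0` checks), and smr_create() now writes `pid` only after every queue and peer slot is initialized, so a peer that maps the region early never sees it half-built. A reduced sketch of that publish-last idea, with hypothetical names:

```c
#include <sys/types.h>
#include <unistd.h>

struct shared_region {
	int queues_ready;	/* stands in for queues, peer table, name */
	volatile pid_t pid;	/* 0 until the region is fully set up */
};

static void region_publish(struct shared_region *r)
{
	r->queues_ready = 1;	/* initialize everything else first */
	r->pid = getpid();	/* written last: signals readiness */
}

static int region_is_ready(const struct shared_region *r)
{
	/* A peer that maps the file early sees pid == 0 and retries.
	 * volatile keeps the compiler from caching the read; production
	 * code would want a real release/acquire barrier on top. */
	return r->pid != 0;
}
```
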
diff --git a/deps/libfabric/prov/util/src/util_wait.c b/deps/libfabric/prov/util/src/util_wait.c
index eca6c3d231281a35cff4d93049f2d1351bee621b..07d6a7a63735ee9012c9e374267ba807ab38c464 100644
--- a/deps/libfabric/prov/util/src/util_wait.c
+++ b/deps/libfabric/prov/util/src/util_wait.c
@@ -180,7 +180,7 @@ static int ofi_wait_match_fd(struct dlist_entry *item, const void *arg)
 	return fd_entry->fd == *(int *) arg;
 }
 
-static int ofi_wait_fdset_del(struct util_wait_fd *wait_fd, int fd)
+int ofi_wait_fdset_del(struct util_wait_fd *wait_fd, int fd)
 {
 	wait_fd->change_index++;
 
@@ -190,7 +190,7 @@ static int ofi_wait_fdset_del(struct util_wait_fd *wait_fd, int fd)
 }
 
 static int ofi_wait_fdset_add(struct util_wait_fd *wait_fd, int fd,
-			       uint32_t events, void *context)
+			      uint32_t events, void *context)
 {
 	int ret;
 
@@ -484,7 +484,7 @@ static int util_wait_fd_close(struct fid *fid)
 	if (wait->util_wait.wait_obj == FI_WAIT_FD)
 		ofi_epoll_close(wait->epoll_fd);
 	else
-		ofi_epoll_close(wait->epoll_fd);
+		ofi_pollfds_close(wait->pollfds);
 	free(wait);
 	return 0;
 }
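
The util_wait_fd_close() hunk fixes a copy/paste bug: the non-FI_WAIT_FD branch closed the epoll fd a second time instead of tearing down the pollfds emulation. The shape of the corrected dispatch, as a sketch with hypothetical types:

```c
#include <unistd.h>

enum wait_backend { WAIT_EPOLL, WAIT_POLLFDS };

struct pollfds_set;				/* hypothetical type */
void pollfds_set_destroy(struct pollfds_set *p);	/* hypothetical */

struct wait_set {
	enum wait_backend backend;
	union {
		int epoll_fd;
		struct pollfds_set *pollfds;
	};
};

static void wait_set_close(struct wait_set *w)
{
	/* Each union variant needs its own teardown; closing epoll_fd on
	 * both paths (the pre-fix behavior) leaked the pollfds variant. */
	if (w->backend == WAIT_EPOLL)
		close(w->epoll_fd);
	else
		pollfds_set_destroy(w->pollfds);
}
```
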
diff --git a/deps/libfabric/prov/verbs/src/fi_verbs.c b/deps/libfabric/prov/verbs/src/fi_verbs.c
index 9ddb58cf4f0719001df0c2f67743d293f08d8227..1bdbe8a64c758251fa0ecd7665beb00e65082c8c 100644
--- a/deps/libfabric/prov/verbs/src/fi_verbs.c
+++ b/deps/libfabric/prov/verbs/src/fi_verbs.c
@@ -35,6 +35,7 @@
 #include <ofi_mem.h>
 
 #include "fi_verbs.h"
+#include "ofi_hmem.h"
 
 static void vrb_fini(void);
 
@@ -101,13 +102,17 @@ int vrb_sockaddr_len(struct sockaddr *addr)
 		return ofi_sizeofaddr(addr);
 }
 
-int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
-		   const struct fi_info *hints, struct rdma_addrinfo **rai)
+static int
+vrb_get_rdmacm_rai(const char *node, const char *service, uint64_t flags,
+		uint32_t addr_format, void *src_addr, size_t src_addrlen,
+		void *dest_addr, size_t dest_addrlen, struct rdma_addrinfo **rai)
 {
 	struct rdma_addrinfo rai_hints, *_rai;
-	struct rdma_addrinfo **rai_current;
-	int ret = vrb_fi_to_rai(hints, flags, &rai_hints);
+	struct rdma_addrinfo **cur, *next;
+	int ret;
 
+	ret = vrb_set_rai(addr_format, src_addr, src_addrlen, dest_addr,
+				       dest_addrlen, flags, &rai_hints);
 	if (ret)
 		goto out;
 
@@ -117,37 +122,32 @@ int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
 		rai_hints.ai_flags |= RAI_PASSIVE;
 	}
 
-	ret = rdma_getaddrinfo((char *) node, (char *) service,
-				&rai_hints, &_rai);
+	ret = rdma_getaddrinfo(node, service, &rai_hints, &_rai);
 	if (ret) {
 		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_getaddrinfo", errno);
-		if (errno) {
+		if (errno)
 			ret = -errno;
-		}
 		goto out;
 	}
 
 	/*
-	 * If caller requested rai, remove ib_rai entries added by IBACM to
+	 * Remove ib_rai entries added by IBACM to
 	 * prevent wrong ib_connect_hdr from being sent in connect request.
 	 */
-	if (rai && hints && (hints->addr_format != FI_SOCKADDR_IB)) {
-		for (rai_current = &_rai; *rai_current;) {
-			struct rdma_addrinfo *rai_next;
-			if ((*rai_current)->ai_family == AF_IB) {
-				rai_next = (*rai_current)->ai_next;
-				(*rai_current)->ai_next = NULL;
-				rdma_freeaddrinfo(*rai_current);
-				*rai_current = rai_next;
-				continue;
+	if (addr_format && (addr_format != FI_SOCKADDR_IB)) {
+		for (cur = &_rai; *cur; ) {
+			if ((*cur)->ai_family == AF_IB) {
+				next = (*cur)->ai_next;
+				(*cur)->ai_next = NULL;
+				rdma_freeaddrinfo(*cur);
+				*cur = next;
+			} else {
+				cur = &(*cur)->ai_next;
 			}
-			rai_current = &(*rai_current)->ai_next;
 		}
 	}
 
-	if (rai)
-		*rai = _rai;
-
+	*rai = _rai;
 out:
 	if (rai_hints.ai_src_addr)
 		free(rai_hints.ai_src_addr);
@@ -156,6 +156,103 @@ out:
 	return ret;
 }
 
+static int
+vrb_get_sib_rai(const char *node, const char *service, uint64_t flags,
+		uint32_t addr_format, void *src_addr, size_t src_addrlen,
+		void *dest_addr, size_t dest_addrlen, struct rdma_addrinfo **rai)
+{
+	struct sockaddr_ib *sib;
+	size_t sib_len;
+	char *straddr;
+	uint32_t fmt;
+	int ret;
+	bool has_prefix;
+	const char *prefix = "fi_sockaddr_ib://";
+
+	*rai = calloc(1, sizeof(struct rdma_addrinfo));
+	if (*rai == NULL)
+		return -FI_ENOMEM;
+
+	ret = vrb_set_rai(addr_format, src_addr, src_addrlen, dest_addr,
+			  dest_addrlen, flags, *rai);
+	if (ret)
+		return ret;
+
+	if (node) {
+		fmt = ofi_addr_format(node);
+		if (fmt == FI_SOCKADDR_IB)
+			has_prefix = true;
+		else if (fmt == FI_FORMAT_UNSPEC)
+			has_prefix = false;
+		else
+			return -FI_EINVAL;
+
+		if (service) {
+			ret = asprintf(&straddr, "%s%s:%s", has_prefix ? "" : prefix,
+				       node, service);
+		} else {
+			ret = asprintf(&straddr, "%s%s", has_prefix ? "" : prefix, node);
+		}
+
+		if (ret == -1)
+			return -FI_ENOMEM;
+
+		ret = ofi_str_toaddr(straddr, &fmt, (void **)&sib, &sib_len);
+		free(straddr);
+
+		if (ret || fmt != FI_SOCKADDR_IB)
+			return -FI_EINVAL;
+
+		if (flags & FI_SOURCE) {
+			(*rai)->ai_flags |= RAI_PASSIVE;
+			if ((*rai)->ai_src_addr)
+				free((*rai)->ai_src_addr);
+			(*rai)->ai_src_addr = (void *)sib;
+			(*rai)->ai_src_len = sizeof(struct sockaddr_ib);
+		} else {
+			if ((*rai)->ai_dst_addr)
+				free((*rai)->ai_dst_addr);
+			(*rai)->ai_dst_addr = (void *)sib;
+			(*rai)->ai_dst_len = sizeof(struct sockaddr_ib);
+		}
+
+	} else if (service) {
+		if ((flags & FI_SOURCE) && (*rai)->ai_src_addr) {
+			if ((*rai)->ai_src_len < sizeof(struct sockaddr_ib))
+				return -FI_EINVAL;
+
+			(*rai)->ai_src_len = sizeof(struct sockaddr_ib);
+			sib = (struct sockaddr_ib *)(*rai)->ai_src_addr;
+		} else {
+			if ((*rai)->ai_dst_len < sizeof(struct sockaddr_ib))
+				return -FI_EINVAL;
+
+			(*rai)->ai_dst_len = sizeof(struct sockaddr_ib);
+			sib = (struct sockaddr_ib *)(*rai)->ai_dst_addr;
+		}
+
+		sib->sib_sid = htonll(((uint64_t) RDMA_PS_IB << 16) + (uint16_t)atoi(service));
+		sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | OFI_IB_IP_PORT_MASK);
+	}
+
+	return 0;
+}
+
+static int
+vrb_get_rdma_rai(const char *node, const char *service, uint32_t addr_format,
+		void *src_addr, size_t src_addrlen, void *dest_addr,
+		size_t dest_addrlen, uint64_t flags, struct rdma_addrinfo **rai)
+{
+	if (addr_format == FI_SOCKADDR_IB && (node || src_addr || dest_addr)) {
+		return vrb_get_sib_rai(node, service, flags, addr_format, src_addr,
+					src_addrlen, dest_addr, dest_addrlen, rai);
+	}
+
+	return vrb_get_rdmacm_rai(node, service, flags, addr_format, src_addr,
+					src_addrlen, dest_addr, dest_addrlen, rai);
+}
+
 int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 		      const struct fi_info *hints, struct rdma_addrinfo **rai,
 		      struct rdma_cm_id **id)
@@ -163,11 +260,19 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 	int ret;
 
 	// TODO create a similar function that won't require pruning ib_rai
-	ret = vrb_get_rdma_rai(node, service, flags, hints, rai);
+	if (hints) {
+		ret = vrb_get_rdma_rai(node, service, hints->addr_format, hints->src_addr,
+				       hints->src_addrlen, hints->dest_addr,
+				       hints->dest_addrlen, flags, rai);
+	} else {
+		ret = vrb_get_rdma_rai(node, service, FI_FORMAT_UNSPEC, NULL, 0, NULL,
+				       0, flags, rai);
+	}
 	if (ret)
 		return ret;
 
-	ret = rdma_create_id(NULL, id, NULL, RDMA_PS_TCP);
+	ret = rdma_create_id(NULL, id, NULL,
+			     vrb_get_port_space(hints ? hints->addr_format :
+						FI_FORMAT_UNSPEC));
 	if (ret) {
 		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_id", errno);
 		ret = -errno;
@@ -206,13 +311,16 @@ err1:
 	return ret;
 }
 
-int vrb_create_ep(const struct fi_info *hints, enum rdma_port_space ps,
+int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps,
 		     struct rdma_cm_id **id)
 {
 	struct rdma_addrinfo *rai = NULL;
 	int ret;
 
-	ret = vrb_get_rdma_rai(NULL, NULL, 0, hints, &rai);
+	ret = vrb_get_rdma_rai(NULL, NULL, ep->info_attr.addr_format,
+				ep->info_attr.src_addr, ep->info_attr.src_addrlen,
+				ep->info_attr.dest_addr, ep->info_attr.dest_addrlen,
+				0, &rai);
 	if (ret) {
 		return ret;
 	}
@@ -243,7 +351,9 @@ int vrb_create_ep(const struct fi_info *hints, enum rdma_port_space ps,
 				"dst addr", rai->ai_dst_addr);
 		goto err2;
 	}
+	rdma_freeaddrinfo(rai);
 	return 0;
+
 err2:
 	rdma_destroy_id(*id);
 err1:
@@ -667,7 +777,8 @@ static void verbs_devs_free(void)
 static void vrb_fini(void)
 {
 #if HAVE_VERBS_DL
-	ofi_monitor_cleanup();
+	ofi_monitors_cleanup();
+	ofi_hmem_cleanup();
 	ofi_mem_fini();
 #endif
 	fi_freeinfo((void *)vrb_util_prov.info);
@@ -679,7 +790,8 @@ VERBS_INI
 {
 #if HAVE_VERBS_DL
 	ofi_mem_init();
-	ofi_monitor_init();
+	ofi_hmem_init();
+	ofi_monitors_init();
 #endif
 	if (vrb_read_params()|| vrb_init_info(&vrb_util_prov.info))
 		return NULL;
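
The new AF_IB path (vrb_get_sib_rai) builds a sockaddr_ib directly, optionally parsing a `fi_sockaddr_ib://` string, and derives the service ID from the RDMA CM port space and the 16-bit port. A small sketch of that encoding; htobe64() stands in for libfabric's htonll() helper on glibc:

```c
#include <stdint.h>
#include <endian.h>		/* htobe64() on glibc */
#include <rdma/rdma_cma.h>	/* RDMA_PS_IB */

/* Compose an AF_IB service ID the way the assignment to sib->sib_sid
 * above does: RDMA CM port space in bits 16 and up, the 16-bit port
 * below it, stored big-endian. */
static uint64_t ib_service_id(uint16_t port)
{
	return htobe64(((uint64_t) RDMA_PS_IB << 16) | port);
}
```
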
diff --git a/deps/libfabric/prov/verbs/src/fi_verbs.h b/deps/libfabric/prov/verbs/src/fi_verbs.h
index 45b4963ada54586629f56e5792ad0b3a696c2ba2..7df34020f64731351716b9d2bf79801bb242de75 100644
--- a/deps/libfabric/prov/verbs/src/fi_verbs.h
+++ b/deps/libfabric/prov/verbs/src/fi_verbs.h
@@ -1,6 +1,8 @@
 /*
  * Copyright (c) 2013-2018 Intel Corporation, Inc.  All rights reserved.
  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2018-2019 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -95,14 +97,14 @@
 
 
 #define VERBS_INJECT_FLAGS(ep, len, flags) ((((flags) & FI_INJECT) || \
-		len <= (ep)->inject_limit) ? IBV_SEND_INLINE : 0)
-#define VERBS_INJECT(ep, len) VERBS_INJECT_FLAGS(ep, len, (ep)->info->tx_attr->op_flags)
+		len <= (ep)->info_attr.inject_size) ? IBV_SEND_INLINE : 0)
+#define VERBS_INJECT(ep, len) VERBS_INJECT_FLAGS(ep, len, (ep)->util_ep.tx_op_flags)
 
 #define VERBS_COMP_FLAGS(ep, flags, context)		\
 	(((ep)->util_ep.tx_op_flags | (flags)) &		\
 	 FI_COMPLETION ? context : VERBS_NO_COMP_FLAG)
 #define VERBS_COMP(ep, context)						\
-	VERBS_COMP_FLAGS((ep), (ep)->info->tx_attr->op_flags, context)
+	VERBS_COMP_FLAGS((ep), (ep)->util_ep.tx_op_flags, context)
 
 #define VERBS_WCE_CNT 1024
 #define VERBS_WRE_CNT 1024
@@ -116,10 +118,10 @@
 #define VERBS_CM_DATA_SIZE	(VRB_CM_DATA_SIZE -		\
 				 sizeof(struct vrb_cm_data_hdr))
 
-#define VRB_CM_REJ_CONSUMER_DEFINED	28
+#define VRB_CM_REJ_CONSUMER_DEFINED		28
 #define VRB_CM_REJ_SIDR_CONSUMER_DEFINED	2
 
-#define VERBS_DGRAM_MSG_PREFIX_SIZE	(40)
+#define VERBS_DGRAM_MSG_PREFIX_SIZE		(40)
 
 #define VRB_EP_TYPE(info)						\
 	((info && info->ep_attr) ? info->ep_attr->type : FI_EP_MSG)
@@ -209,17 +211,18 @@ struct ofi_ib_ud_ep_name {
 static inline
 int vrb_dgram_ns_is_service_wildcard(void *svc)
 {
-	return (*(int *)svc == VERBS_IB_UD_NS_ANY_SERVICE);
+	return (*(int *) svc == VERBS_IB_UD_NS_ANY_SERVICE);
 }
 
 static inline
 int vrb_dgram_ns_service_cmp(void *svc1, void *svc2)
 {
-	int service1 = *(int *)svc1, service2 = *(int *)svc2;
+	int service1 = *(int *) svc1, service2 = *(int *) svc2;
 
 	if (vrb_dgram_ns_is_service_wildcard(svc1) ||
 	    vrb_dgram_ns_is_service_wildcard(svc2))
 		return 0;
+
 	return (service1 < service2) ? -1 : (service1 > service2);
 }
 
@@ -312,7 +315,7 @@ int vrb_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
 
 struct vrb_pep {
 	struct fid_pep		pep_fid;
-	struct vrb_eq	*eq;
+	struct vrb_eq		*eq;
 	struct rdma_cm_id	*id;
 
 	/* XRC uses SIDR based RDMA CM exchanges for setting up
@@ -347,10 +350,13 @@ struct vrb_domain {
 
 	enum fi_ep_type			ep_type;
 	struct fi_info			*info;
+
 	/* The EQ is utilized by verbs/MSG */
-	struct vrb_eq		*eq;
+	struct vrb_eq			*eq;
 	uint64_t			eq_flags;
 
+	ssize_t		(*send_credits)(struct fid_ep *ep, uint64_t credits);
+
 	/* Indicates that MSG endpoints should use the XRC transport.
 	 * TODO: Move selection of XRC/RC to endpoint info from domain */
 	int				flags;
@@ -358,11 +364,15 @@ struct vrb_domain {
 		int			xrcd_fd;
 		struct ibv_xrcd		*xrcd;
 
-		/* The domain maintains a RBTree for mapping an endpoint
-		 * destination addresses to physical XRC INI QP connected
-		 * to that host. The map is protected using the EQ lock
-		 * bound to the domain to avoid the need for additional
-		 * locking. */
+		/* XRC INI QP connections can be shared between endpoints
+		 * within the same domain. The domain maintains an RBTree
+		 * for mapping endpoint destination addresses to the
+		 * physical XRC INI connection to the associated node. The
+		 * map and XRC INI connection object state information are
+		 * protected via the ini_lock. */
+		fastlock_t		ini_lock;
+		ofi_fastlock_acquire_t	lock_acquire;
+		ofi_fastlock_release_t	lock_release;
 		struct ofi_rbmap	*ini_conn_rbmap;
 	} xrc;
 
@@ -471,12 +481,6 @@ struct vrb_srq_ep {
 int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr,
 		       struct fid_ep **rx_ep, void *context);
 
-static inline int vrb_is_xrc(struct fi_info *info)
-{
-	return (VRB_EP_TYPE(info) == FI_EP_MSG) &&
-	       (VRB_EP_PROTO(info) == FI_PROTO_RDMA_CM_IB_XRC);
-}
-
 int vrb_domain_xrc_init(struct vrb_domain *domain);
 int vrb_domain_xrc_cleanup(struct vrb_domain *domain);
 
@@ -498,14 +502,14 @@ enum vrb_ini_qp_state {
 struct vrb_ini_shared_conn {
 	/* To share, EP must have same remote peer host addr and TX CQ */
 	struct sockaddr			*peer_addr;
-	struct vrb_cq		*tx_cq;
+	struct vrb_cq			*tx_cq;
 
 	/* The physical INI/TGT QPN connection. Virtual connections to the
 	 * same remote peer and TGT QPN will share this connection, with
 	 * the remote end opening the specified XRC TGT QPN for sharing
 	 * During the physical connection setup, phys_conn_id identifies
 	 * the RDMA CM ID (and MSG_EP) associated with the operation. */
-	enum vrb_ini_qp_state	state;
+	enum vrb_ini_qp_state		state;
 	struct rdma_cm_id		*phys_conn_id;
 	struct ibv_qp			*ini_qp;
 	uint32_t			tgt_qpn;
@@ -560,21 +564,36 @@ struct vrb_ep {
 	struct ibv_qp			*ibv_qp;
 
 	/* Protected by send CQ lock */
-	size_t				tx_credits;
+	uint64_t			sq_credits;
+	uint64_t			peer_rq_credits;
+	/* Protected by recv CQ lock */
+	int64_t				rq_credits_avail;
+	int64_t				threshold;
 
 	union {
-		struct rdma_cm_id		*id;
+		struct rdma_cm_id	*id;
 		struct {
 			struct ofi_ib_ud_ep_name	ep_name;
 			int				service;
 		};
 	};
 
-	size_t				inject_limit;
-
-	struct vrb_eq		*eq;
+	struct {
+		size_t			inject_size;
+		size_t			tx_size;
+		size_t			tx_iov_limit;
+		size_t			rx_size;
+		size_t			rx_iov_limit;
+		uint32_t		protocol;
+		uint32_t		addr_format;
+		size_t			src_addrlen;
+		size_t			dest_addrlen;
+		void			*src_addr;
+		void			*dest_addr;
+		void			*handle;
+	} info_attr;
+	struct vrb_eq			*eq;
 	struct vrb_srq_ep		*srq_ep;
-	struct fi_info			*info;
 
 	struct {
 		struct ibv_send_wr	rma_wr;
@@ -583,7 +602,8 @@ struct vrb_ep {
 	} *wrs;
 	size_t				rx_cq_size;
 	struct rdma_conn_param		conn_param;
-	struct vrb_cm_data_hdr	*cm_hdr;
+	struct vrb_cm_data_hdr		*cm_hdr;
+	void				*cm_priv_data;
 };
 
 
@@ -599,7 +619,7 @@ struct vrb_context {
 #define VERBS_XRC_EP_MAGIC		0x1F3D5B79
 struct vrb_xrc_ep {
 	/* Must be first */
-	struct vrb_ep		base_ep;
+	struct vrb_ep			base_ep;
 
 	/* XRC only fields */
 	struct rdma_cm_id		*tgt_id;
@@ -628,11 +648,23 @@ struct vrb_xrc_ep {
 	struct vrb_xrc_ep_conn_setup	*conn_setup;
 };
 
+static inline int vrb_is_xrc_info(struct fi_info *info)
+{
+	return (VRB_EP_TYPE(info) == FI_EP_MSG) &&
+		(VRB_EP_PROTO(info) == FI_PROTO_RDMA_CM_IB_XRC);
+}
+
+static inline int vrb_is_xrc_ep(struct vrb_ep *ep)
+{
+	return (ep->util_ep.type == FI_EP_MSG) &&
+		(ep->info_attr.protocol == FI_PROTO_RDMA_CM_IB_XRC);
+}
+
 int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		   struct fid_ep **ep, void *context);
 int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 		      struct fid_pep **pep, void *context);
-int vrb_create_ep(const struct fi_info *hints, enum rdma_port_space ps,
+int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps,
 		     struct rdma_cm_id **id);
 int vrb_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
 			 struct fid_av **av_fid, void *context);
@@ -690,6 +722,17 @@ struct vrb_connreq {
 	struct vrb_xrc_conn_info	xrc;
 };
 
+/* The structure below is a copy of the RDMA CM header (struct ib_connect_hdr
+ * in librdmacm/cma.h) and must stay in sync with that definition.
+ * DO NOT MODIFY! */
+struct vrb_rdma_cm_hdr {
+	uint8_t  cma_version; /* Set by the kernel */
+	uint8_t  ip_version; /* IP version, stored in bits 7:4 */
+	uint16_t port;
+	uint32_t src_addr[4];
+	uint32_t dst_addr[4];
+};
+
 struct vrb_cm_data_hdr {
 	uint8_t	size;
 	char	data[];
@@ -745,8 +788,6 @@ void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *qp);
 int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep);
 
 int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep);
-int vrb_sockaddr_len(struct sockaddr *addr);
-
 
 int vrb_init_info(const struct fi_info **all_infos);
 int vrb_getinfo(uint32_t version, const char *node, const char *service,
@@ -754,13 +795,13 @@ int vrb_getinfo(uint32_t version, const char *node, const char *service,
 		   struct fi_info **info);
 const struct fi_info *vrb_get_verbs_info(const struct fi_info *ilist,
 					    const char *domain_name);
-int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags,
-		     struct rdma_addrinfo *rai);
-int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
-			const struct fi_info *hints, struct rdma_addrinfo **rai);
+int vrb_set_rai(uint32_t addr_format, void *src_addr, size_t src_addrlen,
+		void *dest_addr, size_t dest_addrlen, uint64_t flags,
+		struct rdma_addrinfo *rai);
 int vrb_get_matching_info(uint32_t version, const struct fi_info *hints,
 			     struct fi_info **info, const struct fi_info *verbs_info,
 			     uint8_t passive);
+int vrb_get_port_space(uint32_t addr_format);
 void vrb_alter_info(const struct fi_info *hints, struct fi_info *info);
 
 struct verbs_ep_domain {
@@ -841,10 +882,10 @@ static inline ssize_t vrb_convert_ret(int ret)
 int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc);
 int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc);
 
-#define vrb_init_sge(buf, len, desc) (struct ibv_sge)		\
-	{ .addr = (uintptr_t)buf,					\
-	  .length = (uint32_t)len,					\
-	  .lkey = (uint32_t)(uintptr_t)desc }
+#define vrb_init_sge(buf, len, desc) (struct ibv_sge)	\
+	{ .addr = (uintptr_t) buf,			\
+	  .length = (uint32_t) len,			\
+	  .lkey = (uint32_t) (uintptr_t) desc }
 
 #define vrb_set_sge_iov(sg_list, iov, count, desc)	\
 ({							\
@@ -887,14 +928,17 @@ int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc);
 
 #define vrb_send_iov(ep, wr, iov, desc, count)		\
 	vrb_send_iov_flags(ep, wr, iov, desc, count,		\
-			      (ep)->info->tx_attr->op_flags)
+			      (ep)->util_ep.tx_op_flags)
 
 #define vrb_send_msg(ep, wr, msg, flags)				\
 	vrb_send_iov_flags(ep, wr, (msg)->msg_iov, (msg)->desc,	\
 			      (msg)->iov_count, flags)
 
+#define vrb_wr_consumes_recv(wr)					\
+	((wr)->opcode == IBV_WR_SEND ||					\
+	 (wr)->opcode == IBV_WR_SEND_WITH_IMM ||			\
+	 (wr)->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
 
-ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr);
+ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags);
 ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr);
 
 static inline ssize_t
@@ -908,7 +952,7 @@ vrb_send_buf(struct vrb_ep *ep, struct ibv_send_wr *wr,
 	wr->sg_list = &sge;
 	wr->num_sge = 1;
 
-	return vrb_post_send(ep, wr);
+	return vrb_post_send(ep, wr, 0);
 }
 
 static inline ssize_t
@@ -922,7 +966,7 @@ vrb_send_buf_inline(struct vrb_ep *ep, struct ibv_send_wr *wr,
 	wr->sg_list = &sge;
 	wr->num_sge = 1;
 
-	return vrb_post_send(ep, wr);
+	return vrb_post_send(ep, wr, 0);
 }
 
 static inline ssize_t
@@ -944,9 +988,11 @@ vrb_send_iov_flags(struct vrb_ep *ep, struct ibv_send_wr *wr,
 	if (flags & FI_FENCE)
 		wr->send_flags |= IBV_SEND_FENCE;
 
-	return vrb_post_send(ep, wr);
+	return vrb_post_send(ep, wr, flags);
 }
 
+void vrb_add_credits(struct fid_ep *ep, size_t credits);
+
 int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
 		      const struct fi_info *hints, struct rdma_addrinfo **rai,
 		      struct rdma_cm_id **id);
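
The new vrb_ep fields implement credit-based flow control: sq_credits bounds locally posted sends, peer_rq_credits tracks receive buffers the peer has advertised, and vrb_wr_consumes_recv() identifies the opcodes that consume one. An illustrative sketch of how such counters gate a post (not the provider's actual code):

```c
#include <stdint.h>

/* Gate a send on local SQ space and, when the work request consumes a
 * peer receive (cf. vrb_wr_consumes_recv()), on a peer credit. */
static int try_post_send(uint64_t *sq_credits, uint64_t *peer_rq_credits,
			 int consumes_recv)
{
	if (!*sq_credits || (consumes_recv && !*peer_rq_credits))
		return -1;	/* caller retries, akin to -FI_EAGAIN */

	(*sq_credits)--;
	if (consumes_recv)
		(*peer_rq_credits)--;
	/* ... ibv_post_send() would go here ... */
	return 0;
}
```
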
diff --git a/deps/libfabric/prov/verbs/src/ofi_verbs_priv.h b/deps/libfabric/prov/verbs/src/ofi_verbs_priv.h
index 28b617d6d71757137cd9788536c40d726f2144e5..88b5c11314562cb762a3510c43007be36d97eccb 100644
--- a/deps/libfabric/prov/verbs/src/ofi_verbs_priv.h
+++ b/deps/libfabric/prov/verbs/src/ofi_verbs_priv.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2018 Cray Inc.  All rights reserved.
+ * Copyright (c) 2019 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
diff --git a/deps/libfabric/prov/verbs/src/verbs_cm.c b/deps/libfabric/prov/verbs/src/verbs_cm.c
index 174682eac643da85810644becf0beffecfdca82e..f341c8738275693a070288bd8d909c52ef85b4d8 100644
--- a/deps/libfabric/prov/verbs/src/verbs_cm.c
+++ b/deps/libfabric/prov/verbs/src/verbs_cm.c
@@ -37,7 +37,7 @@
 
 static int vrb_copy_addr(void *dst_addr, size_t *dst_addrlen, void *src_addr)
 {
-	size_t src_addrlen = vrb_sockaddr_len(src_addr);
+	size_t src_addrlen = ofi_sizeofaddr(src_addr);
 
 	if (*dst_addrlen == 0) {
 		*dst_addrlen = src_addrlen;
@@ -61,24 +61,24 @@ static int vrb_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen)
 	struct vrb_ep *ep =
 		container_of(ep_fid, struct vrb_ep, util_ep.ep_fid);
 
-	if (addrlen != ep->info->src_addrlen) {
+	if (addrlen != ep->info_attr.src_addrlen) {
 		VERBS_INFO(FI_LOG_EP_CTRL,"addrlen expected: %zu, got: %zu.\n",
-			   ep->info->src_addrlen, addrlen);
+			   ep->info_attr.src_addrlen, addrlen);
 		return -FI_EINVAL;
 	}
 
-	save_addr = ep->info->src_addr;
+	save_addr = ep->info_attr.src_addr;
 
-	ep->info->src_addr = malloc(ep->info->src_addrlen);
-	if (!ep->info->src_addr) {
+	ep->info_attr.src_addr = malloc(ep->info_attr.src_addrlen);
+	if (!ep->info_attr.src_addr) {
 		VERBS_WARN(FI_LOG_EP_CTRL, "memory allocation failure\n");
 		ret = -FI_ENOMEM;
 		goto err1;
 	}
 
-	memcpy(ep->info->src_addr, addr, ep->info->src_addrlen);
+	memcpy(ep->info_attr.src_addr, addr, ep->info_attr.src_addrlen);
 
-	ret = vrb_create_ep(ep->info, RDMA_PS_TCP, &id);
+	ret = vrb_create_ep(ep, RDMA_PS_TCP, &id);
 	if (ret)
 		goto err2;
 
@@ -91,17 +91,16 @@ static int vrb_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen)
 
 	return 0;
 err2:
-	free(ep->info->src_addr);
+	free(ep->info_attr.src_addr);
 err1:
-	ep->info->src_addr = save_addr;
+	ep->info_attr.src_addr = save_addr;
 	return ret;
 }
 
 static int vrb_msg_ep_getname(fid_t ep, void *addr, size_t *addrlen)
 {
 	struct sockaddr *sa;
-	struct vrb_ep *_ep =
-		container_of(ep, struct vrb_ep, util_ep.ep_fid);
+	struct vrb_ep *_ep = container_of(ep, struct vrb_ep, util_ep.ep_fid);
 	sa = rdma_get_local_addr(_ep->id);
 	return vrb_copy_addr(addr, addrlen, sa);
 }
@@ -109,8 +108,7 @@ static int vrb_msg_ep_getname(fid_t ep, void *addr, size_t *addrlen)
 static int vrb_msg_ep_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen)
 {
 	struct sockaddr *sa;
-	struct vrb_ep *_ep =
-		container_of(ep, struct vrb_ep, util_ep.ep_fid);
+	struct vrb_ep *_ep = container_of(ep, struct vrb_ep, util_ep.ep_fid);
 	sa = rdma_get_peer_addr(_ep->id);
 	return vrb_copy_addr(addr, addrlen, sa);
 }
@@ -125,23 +123,41 @@ vrb_msg_ep_prepare_cm_data(const void *param, size_t param_size,
 
 static inline void
 vrb_ep_prepare_rdma_cm_param(struct rdma_conn_param *conn_param,
-				struct vrb_cm_data_hdr *cm_hdr,
-				size_t cm_hdr_data_size)
+				void *priv_data, size_t priv_data_size)
 {
-	conn_param->private_data = cm_hdr;
-	conn_param->private_data_len = (uint8_t)cm_hdr_data_size;
+	conn_param->private_data = priv_data;
+	conn_param->private_data_len = (uint8_t)priv_data_size;
 	conn_param->responder_resources = RDMA_MAX_RESP_RES;
 	conn_param->initiator_depth = RDMA_MAX_INIT_DEPTH;
 	conn_param->flow_control = 1;
 	conn_param->rnr_retry_count = 7;
 }
 
+static void
+vrb_msg_ep_prepare_rdma_cm_hdr(void *priv_data,
+				const struct rdma_cm_id *id)
+{
+	struct vrb_rdma_cm_hdr *rdma_cm_hdr = priv_data;
+
+	rdma_cm_hdr->ip_version = 6 << 4; /* IPv6 */
+	rdma_cm_hdr->port = htons(ofi_addr_get_port(&id->route.addr.src_addr));
+
+	/* Record the GIDs */
+	memcpy(rdma_cm_hdr->src_addr,
+		   &((struct ofi_sockaddr_ib *)&id->route.addr.src_addr)->sib_addr, 16);
+	memcpy(rdma_cm_hdr->dst_addr,
+		   &((struct ofi_sockaddr_ib *)&id->route.addr.dst_addr)->sib_addr, 16);
+}
+
 static int
 vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
 		      const void *param, size_t paramlen)
 {
 	struct vrb_ep *ep =
 		container_of(ep_fid, struct vrb_ep, util_ep.ep_fid);
+	size_t priv_data_len;
+	struct vrb_cm_data_hdr *cm_hdr;
+	off_t rdma_cm_hdr_len = 0;
 	int ret;
 
 	if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE))
@@ -153,13 +169,21 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
 			return ret;
 	}
 
-	ep->cm_hdr = malloc(sizeof(*(ep->cm_hdr)) + paramlen);
-	if (!ep->cm_hdr)
+	if (ep->id->route.addr.src_addr.sa_family == AF_IB)
+		rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr);
+
+	priv_data_len = sizeof(*cm_hdr) + paramlen + rdma_cm_hdr_len;
+	ep->cm_priv_data = malloc(priv_data_len);
+	if (!ep->cm_priv_data)
 		return -FI_ENOMEM;
 
-	vrb_msg_ep_prepare_cm_data(param, paramlen, ep->cm_hdr);
-	vrb_ep_prepare_rdma_cm_param(&ep->conn_param, ep->cm_hdr,
-					sizeof(*(ep->cm_hdr)) + paramlen);
+	if (rdma_cm_hdr_len)
+		vrb_msg_ep_prepare_rdma_cm_hdr(ep->cm_priv_data, ep->id);
+
+	cm_hdr = (void *)((char *)ep->cm_priv_data + rdma_cm_hdr_len);
+	vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr);
+	vrb_ep_prepare_rdma_cm_param(&ep->conn_param, ep->cm_priv_data,
+					priv_data_len);
 	ep->conn_param.retry_count = 15;
 
 	if (ep->srq_ep)
@@ -170,8 +194,8 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
 		FI_WARN(&vrb_prov, FI_LOG_EP_CTRL,
 			"rdma_resolve_route failed: %s (%d)\n",
 			strerror(-ret), -ret);
-		free(ep->cm_hdr);
-		ep->cm_hdr = NULL;
+		free(ep->cm_priv_data);
+		ep->cm_priv_data = NULL;
 		return ret;
 	}
 	return 0;
@@ -208,7 +232,7 @@ vrb_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
 	if (ret)
 		return -errno;
 
-	connreq = container_of(_ep->info->handle, struct vrb_connreq, handle);
+	connreq = container_of(_ep->info_attr.handle, struct vrb_connreq, handle);
 	free(connreq);
 
 	return 0;
@@ -318,7 +342,7 @@ vrb_msg_xrc_cm_common_verify(struct vrb_xrc_ep *ep, size_t paramlen)
 {
 	int ret;
 
-	if (!vrb_is_xrc(ep->base_ep.info)) {
+	if (!vrb_is_xrc_ep(&ep->base_ep)) {
 		VERBS_WARN(FI_LOG_EP_CTRL, "EP is not using XRC\n");
 		return -FI_EINVAL;
 	}
@@ -487,15 +511,14 @@ static int vrb_pep_listen(struct fid_pep *pep_fid)
 	pep = container_of(pep_fid, struct vrb_pep, pep_fid);
 
 	addr = rdma_get_local_addr(pep->id);
-	if (addr)
-		ofi_straddr_log(&vrb_prov, FI_LOG_INFO,
-				FI_LOG_EP_CTRL, "listening on", addr);
+	ofi_straddr_log(&vrb_prov, FI_LOG_INFO,
+			FI_LOG_EP_CTRL, "listening on", addr);
 
 	ret = rdma_listen(pep->id, pep->backlog);
 	if (ret)
 		return -errno;
 
-	if (vrb_is_xrc(pep->info)) {
+	if (vrb_is_xrc_info(pep->info)) {
 		ret = rdma_listen(pep->xrc_ps_udp_id, pep->backlog);
 		if (ret)
 			ret = -errno;
diff --git a/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c b/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c
index 8e49e411256d563bce86829f0d825dfd79fbb5ea..c88bd8fbce62e5f706a09e7b05fe62fc0461573a 100644
--- a/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c
+++ b/deps/libfabric/prov/verbs/src/verbs_cm_xrc.c
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2018 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -127,7 +128,7 @@ void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
 {
 	struct sockaddr *addr;
 	char buf[OFI_ADDRSTRLEN];
-	size_t len = sizeof(buf);
+	size_t len;
 
 	if (!fi_log_enabled(&vrb_prov, FI_LOG_INFO, FI_LOG_EP_CTRL))
 		return;
@@ -140,20 +141,14 @@ void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
 
 	if (ep->base_ep.id) {
 		addr = rdma_get_local_addr(ep->base_ep.id);
-		if (addr) {
-			ofi_straddr(buf, &len, ep->base_ep.info->addr_format,
-				    addr);
-			VERBS_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n",
-				   ep, buf);
-		}
+		len = sizeof(buf);
+		ofi_straddr(buf, &len, ep->base_ep.info_attr.addr_format, addr);
+		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n", ep, buf);
+
 		addr = rdma_get_peer_addr(ep->base_ep.id);
-		if (addr) {
-			len = sizeof(buf);
-			ofi_straddr(buf, &len, ep->base_ep.info->addr_format,
-				    addr);
-			VERBS_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n",
-				   ep, buf);
-		}
+		len = sizeof(buf);
+		ofi_straddr(buf, &len, ep->base_ep.info_attr.addr_format, addr);
+		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n", ep, buf);
 	}
 
 	if (ep->base_ep.ibv_qp) {
@@ -197,6 +192,9 @@ void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect)
 	if (!disconnect) {
 		free(ep->conn_setup);
 		ep->conn_setup = NULL;
+		free(ep->base_ep.info_attr.src_addr);
+		ep->base_ep.info_attr.src_addr = NULL;
+		ep->base_ep.info_attr.src_addrlen = 0;
 	}
 }
 
@@ -204,10 +202,12 @@ void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect)
 int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
 		       int reciprocal, void *param, size_t paramlen)
 {
+	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
 	int ret;
 
 	assert(!ep->base_ep.id && !ep->base_ep.ibv_qp && !ep->ini_conn);
 
+	domain->xrc.lock_acquire(&domain->xrc.ini_lock);
 	ret = vrb_get_shared_ini_conn(ep, &ep->ini_conn);
 	if (ret) {
 		VERBS_WARN(FI_LOG_EP_CTRL,
@@ -216,12 +216,14 @@ int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
 			free(ep->conn_setup);
 			ep->conn_setup = NULL;
 		}
+		domain->xrc.lock_release(&domain->xrc.ini_lock);
 		return ret;
 	}
 
 	vrb_eq_set_xrc_conn_tag(ep);
 	vrb_add_pending_ini_conn(ep, reciprocal, param, paramlen);
 	vrb_sched_ini_conn(ep->ini_conn);
+	domain->xrc.lock_release(&domain->xrc.ini_lock);
 
 	return FI_SUCCESS;
 }
@@ -229,8 +231,11 @@ int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
 /* Caller must hold the eq:lock */
 void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 {
+	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
+
 	assert(ep->base_ep.id && ep->ini_conn);
 
+	domain->xrc.lock_acquire(&domain->xrc.ini_lock);
 	assert(ep->ini_conn->state == VRB_INI_QP_CONNECTING ||
 	       ep->ini_conn->state == VRB_INI_QP_CONNECTED);
 
@@ -250,6 +255,7 @@ void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 
 	vrb_log_ep_conn(ep, "INI Connection Done");
 	vrb_sched_ini_conn(ep->ini_conn);
+	domain->xrc.lock_release(&domain->xrc.ini_lock);
 }
 
 /* Caller must hold the eq:lock */
@@ -320,7 +326,7 @@ int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
 	if (addr)
 		ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "dest_addr", addr);
 
-	connreq = container_of(ep->base_ep.info->handle,
+	connreq = container_of(ep->base_ep.info_attr.handle,
 			       struct vrb_connreq, handle);
 	ret = vrb_ep_create_tgt_qp(ep, connreq->xrc.tgt_qpn);
 	if (ret)
@@ -390,8 +396,8 @@ int vrb_process_xrc_connreq(struct vrb_ep *ep,
 	struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep,
 						    base_ep);
 
-	assert(ep->info->src_addr);
-	assert(ep->info->dest_addr);
+	assert(ep->info_attr.src_addr);
+	assert(ep->info_attr.dest_addr);
 
 	xrc_ep->conn_setup = calloc(1, sizeof(*xrc_ep->conn_setup));
 	if (!xrc_ep->conn_setup) {
@@ -404,8 +410,8 @@ int vrb_process_xrc_connreq(struct vrb_ep *ep,
 	/* This endpoint was created on the passive side of a connection
 	 * request. The reciprocal connection request will go back to the
 	 * passive port indicated by the active side */
-	ofi_addr_set_port(ep->info->src_addr, 0);
-	ofi_addr_set_port(ep->info->dest_addr, connreq->xrc.port);
+	ofi_addr_set_port(ep->info_attr.src_addr, 0);
+	ofi_addr_set_port(ep->info_attr.dest_addr, connreq->xrc.port);
 	xrc_ep->tgt_id = connreq->id;
 	xrc_ep->tgt_id->context = &ep->util_ep.ep_fid.fid;
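
The XRC hunks above move shared INI connection state from the EQ lock to the domain's dedicated xrc.ini_lock, held across the whole lookup/queue/schedule sequence so two endpoints cannot race to create the same physical connection. A compile-only sketch of that critical section, with hypothetical helpers standing in for vrb_get_shared_ini_conn(), vrb_add_pending_ini_conn() and vrb_sched_ini_conn():

```c
#include <pthread.h>

struct ini_conn;			/* opaque shared INI connection */

struct xrc_domain {
	pthread_mutex_t ini_lock;	/* stands in for xrc.ini_lock */
};

/* Hypothetical helpers; defined elsewhere in a real implementation. */
struct ini_conn *get_shared_conn(struct xrc_domain *d);
void add_pending(struct ini_conn *c);
void sched_conn(struct ini_conn *c);

static int connect_xrc(struct xrc_domain *d)
{
	struct ini_conn *conn;

	pthread_mutex_lock(&d->ini_lock);
	conn = get_shared_conn(d);	/* rbmap lookup or insert */
	if (!conn) {
		pthread_mutex_unlock(&d->ini_lock);
		return -1;
	}
	add_pending(conn);		/* queue this EP's request */
	sched_conn(conn);		/* progress the physical connect */
	pthread_mutex_unlock(&d->ini_lock);
	return 0;
}
```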
 
diff --git a/deps/libfabric/prov/verbs/src/verbs_cq.c b/deps/libfabric/prov/verbs/src/verbs_cq.c
index a147898ee7d6da1a71dc19d359727edda0ab4b49..14d91bab908eed2eb1f5e26449b194ad6e255f9b 100644
--- a/deps/libfabric/prov/verbs/src/verbs_cq.c
+++ b/deps/libfabric/prov/verbs/src/verbs_cq.c
@@ -241,7 +241,7 @@ int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc)
 		wc->wr_id = (uintptr_t) ctx->user_ctx;
 		if (ctx->flags & FI_TRANSMIT) {
 			cq->credits++;
-			ctx->ep->tx_credits++;
+			ctx->ep->sq_credits++;
 		}
 
 		if (wc->status) {
diff --git a/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c b/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c
index ed91ff3e247dde4750bb092b68ffae842dde8359..d71b575109098b0d75175dc695f7268af908da9c 100644
--- a/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c
+++ b/deps/libfabric/prov/verbs/src/verbs_dgram_ep_msg.c
@@ -206,7 +206,7 @@ vrb_dgram_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &ep->wrs->msg_wr))
 		return -FI_ENOENT;
 
-	ret = vrb_post_send(ep, &ep->wrs->msg_wr);
+	ret = vrb_post_send(ep, &ep->wrs->msg_wr, 0);
 	ep->wrs->msg_wr.opcode = IBV_WR_SEND;
 	return ret;
 }
@@ -242,7 +242,7 @@ vrb_dgram_ep_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 	if (vrb_dgram_ep_set_addr(ep, dest_addr, &ep->wrs->msg_wr))
 		return -FI_ENOENT;
 
-	return vrb_post_send(ep, &ep->wrs->msg_wr);
+	return vrb_post_send(ep, &ep->wrs->msg_wr, 0);
 }
 
 const struct fi_ops_msg vrb_dgram_msg_ops = {
diff --git a/deps/libfabric/prov/verbs/src/verbs_domain.c b/deps/libfabric/prov/verbs/src/verbs_domain.c
index 1323304d7202d6fa80df69609ae7a40e5346b777..b64f96d096b951116c9b9ba01390a25cb2614f75 100644
--- a/deps/libfabric/prov/verbs/src/verbs_domain.c
+++ b/deps/libfabric/prov/verbs/src/verbs_domain.c
@@ -38,6 +38,59 @@
 #include <malloc.h>
 
 
+static void vrb_set_threshold(struct fid_ep *ep_fid, size_t threshold)
+{
+	struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid);
+	ep->threshold = threshold;
+}
+
+static void vrb_set_credit_handler(struct fid_domain *domain_fid,
+		ssize_t (*credit_handler)(struct fid_ep *ep, size_t credits))
+{
+	struct vrb_domain *domain;
+
+	domain = container_of(domain_fid, struct vrb_domain,
+			      util_domain.domain_fid.fid);
+	domain->send_credits = credit_handler;
+}
+
+static int vrb_enable_ep_flow_ctrl(struct fid_ep *ep_fid)
+{
+	struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid);
+	/* Only enable if the endpoint is not using an SRQ */
+	if (!ep->srq_ep && ep->ibv_qp && ep->ibv_qp->qp_type == IBV_QPT_RC) {
+		ep->peer_rq_credits = 1;
+		return FI_SUCCESS;
+	}
+
+	return -FI_ENOSYS;
+}
+
+struct ofi_ops_flow_ctrl vrb_ops_flow_ctrl = {
+	.size = sizeof(struct ofi_ops_flow_ctrl),
+	.set_threshold = vrb_set_threshold,
+	.add_credits = vrb_add_credits,
+	.enable = vrb_enable_ep_flow_ctrl,
+	.set_send_handler = vrb_set_credit_handler,
+};
+
+static int
+vrb_domain_ops_open(struct fid *fid, const char *name, uint64_t flags,
+		    void **ops, void *context)
+{
+	if (flags)
+		return -FI_EBADFLAGS;
+
+	if (!strcasecmp(name, OFI_OPS_FLOW_CTRL)) {
+		*ops = &vrb_ops_flow_ctrl;
+		return 0;
+	}
+
+	return -FI_ENOSYS;
+}
+
+
 #if VERBS_HAVE_QUERY_EX
 static int vrb_odp_flag(struct ibv_context *verbs)
 {
@@ -190,7 +243,7 @@ static struct fi_ops vrb_fid_ops = {
 	.close = vrb_domain_close,
 	.bind = vrb_domain_bind,
 	.control = fi_no_control,
-	.ops_open = fi_no_ops_open,
+	.ops_open = vrb_domain_ops_open,
 };
 
 static struct fi_ops_domain vrb_msg_domain_ops = {
@@ -225,6 +278,9 @@ static int
 vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 	      struct fid_domain **domain, void *context)
 {
+	struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = {
+		[FI_HMEM_SYSTEM] = default_monitor,
+	};
 	struct vrb_domain *_domain;
 	int ret;
 	struct vrb_fabric *fab =
@@ -253,7 +309,7 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 		goto err2;
 
 	_domain->ep_type = VRB_EP_TYPE(info);
-	_domain->flags |= vrb_is_xrc(info) ? VRB_USE_XRC : 0;
+	_domain->flags |= vrb_is_xrc_info(info) ? VRB_USE_XRC : 0;
 
 	ret = vrb_open_device_by_name(_domain, info->domain_attr->name);
 	if (ret)
@@ -273,7 +329,7 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
 	_domain->cache.entry_data_size = sizeof(struct vrb_mem_desc);
 	_domain->cache.add_region = vrb_mr_cache_add_region;
 	_domain->cache.delete_region = vrb_mr_cache_delete_region;
-	ret = ofi_mr_cache_init(&_domain->util_domain, default_monitor,
+	ret = ofi_mr_cache_init(&_domain->util_domain, memory_monitors,
 				&_domain->cache);
 	if (!ret)
 		_domain->util_domain.domain_fid.mr = &vrb_mr_cache_ops;
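
vrb_domain_ops_open() exports the credit hooks through the generic fid ops mechanism, so a provider layered on top of verbs can drive flow control without a hard dependency. A sketch of such a consumer, assuming libfabric's internal ofi_util.h definitions; my_send_credits_cb is a hypothetical callback:

```c
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include "ofi_util.h"	/* struct ofi_ops_flow_ctrl, OFI_OPS_FLOW_CTRL */

static ssize_t my_send_credits_cb(struct fid_ep *ep, size_t credits)
{
	/* Hypothetical: forward a credit update to the remote peer. */
	(void) ep;
	(void) credits;
	return 0;
}

static int setup_flow_ctrl(struct fid_domain *domain, struct fid_ep *ep,
			   size_t rx_size)
{
	struct ofi_ops_flow_ctrl *ops;
	int ret;

	ret = fi_open_ops(&domain->fid, OFI_OPS_FLOW_CTRL, 0,
			  (void **) &ops, NULL);
	if (ret)
		return ret;	/* e.g. -FI_ENOSYS: domain has no hooks */

	ops->set_send_handler(domain, my_send_credits_cb);
	ops->set_threshold(ep, rx_size / 2);
	return ops->enable(ep);
}
```
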
diff --git a/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c b/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c
index d4408976032ab4bf83d1991bfb43cc9f4b74ea13..70f39a6de66c7cd66214a8b33eac19c6047befd2 100644
--- a/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c
+++ b/deps/libfabric/prov/verbs/src/verbs_domain_xrc.c
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2018 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -106,12 +107,12 @@ static int vrb_create_ini_qp(struct vrb_xrc_ep *ep)
 static inline void vrb_set_ini_conn_key(struct vrb_xrc_ep *ep,
 					   struct vrb_ini_conn_key *key)
 {
-	key->addr = ep->base_ep.info->dest_addr;
+	key->addr = ep->base_ep.info_attr.dest_addr;
 	key->tx_cq = container_of(ep->base_ep.util_ep.tx_cq,
 				  struct vrb_cq, util_cq);
 }
 
-/* Caller must hold domain:eq:lock */
+/* Caller must hold domain:xrc.ini_lock */
 int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
 			       struct vrb_ini_shared_conn **ini_conn) {
 	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
@@ -167,8 +168,8 @@ insert_err:
 	return ret;
 }
 
-/* Caller must hold domain:eq:lock */
-void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
+/* Caller must hold domain:xrc.ini_lock */
+void _vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
 {
 	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
 	struct vrb_ini_shared_conn *ini_conn;
@@ -212,7 +213,16 @@ void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
 	}
 }
 
-/* Caller must hold domain:eq:lock */
+void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep)
+{
+	struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep);
+
+	domain->xrc.lock_acquire(&domain->xrc.ini_lock);
+	_vrb_put_shared_ini_conn(ep);
+	domain->xrc.lock_release(&domain->xrc.ini_lock);
+}
+
+/* Caller must hold domain:xrc.ini_lock */
 void vrb_add_pending_ini_conn(struct vrb_xrc_ep *ep, int reciprocal,
 				 void *conn_param, size_t conn_paramlen)
 {
@@ -237,12 +247,11 @@ static void vrb_create_shutdown_event(struct vrb_xrc_ep *ep)
 		dlistfd_insert_tail(&eq_entry->item, &ep->base_ep.eq->list_head);
 }
 
-/* Caller must hold domain:eq:lock */
+/* Caller must hold domain:xrc.ini_lock */
 void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 {
 	struct vrb_xrc_ep *ep;
 	enum vrb_ini_qp_state last_state;
-	struct sockaddr *addr;
 	int ret;
 
 	/* Continue to schedule shared connections if the physical connection
@@ -261,7 +270,7 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 				  &ep->ini_conn->active_list);
 		last_state = ep->ini_conn->state;
 
-		ret = vrb_create_ep(ep->base_ep.info,
+		ret = vrb_create_ep(&ep->base_ep,
 				       last_state == VRB_INI_QP_UNCONNECTED ?
 				       RDMA_PS_TCP : RDMA_PS_UDP,
 				       &ep->base_ep.id);
@@ -305,14 +314,10 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 			goto err;
 		}
 
-		addr = rdma_get_local_addr(ep->base_ep.id);
-		if (addr)
-			ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL,
-					"XRC connect src_addr", addr);
-		addr = rdma_get_peer_addr(ep->base_ep.id);
-		if (addr)
-			ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL,
-					"XRC connect dest_addr", addr);
+		ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL, "XRC connect src_addr",
+				rdma_get_local_addr(ep->base_ep.id));
+		ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL, "XRC connect dest_addr",
+				rdma_get_peer_addr(ep->base_ep.id));
 
 		ep->base_ep.ibv_qp = ep->ini_conn->ini_qp;
 		ret = vrb_process_ini_conn(ep, ep->conn_setup->pending_recip,
@@ -321,7 +326,7 @@ void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn)
 err:
 		if (ret) {
 			ep->ini_conn->state = last_state;
-			vrb_put_shared_ini_conn(ep);
+			_vrb_put_shared_ini_conn(ep);
 
 			/* We need to let the application know that the
 			 * connect request has failed. */
@@ -559,6 +564,14 @@ int vrb_domain_xrc_init(struct vrb_domain *domain)
 		goto rbmap_err;
 	}
 
+	fastlock_init(&domain->xrc.ini_lock);
+	if (domain->util_domain.threading == FI_THREAD_DOMAIN) {
+		domain->xrc.lock_acquire = ofi_fastlock_acquire_noop;
+		domain->xrc.lock_release = ofi_fastlock_release_noop;
+	} else {
+		domain->xrc.lock_acquire = ofi_fastlock_acquire;
+		domain->xrc.lock_release = ofi_fastlock_release;
+	}
 	domain->flags |= VRB_USE_XRC;
 	return FI_SUCCESS;
 
@@ -581,7 +594,6 @@ int vrb_domain_xrc_cleanup(struct vrb_domain *domain)
 	int ret;
 
 	assert(domain->xrc.xrcd);
-
 	/* All endpoint and hence XRC INI QP should be closed */
 	if (!ofi_rbmap_empty(domain->xrc.ini_conn_rbmap)) {
 		VERBS_WARN(FI_LOG_DOMAIN, "XRC domain busy\n");
@@ -599,6 +611,7 @@ int vrb_domain_xrc_cleanup(struct vrb_domain *domain)
 	}
 
 	ofi_rbmap_destroy(domain->xrc.ini_conn_rbmap);
+	fastlock_destroy(&domain->xrc.ini_lock);
 #endif /* VERBS_HAVE_XRC */
 	return 0;
 }
diff --git a/deps/libfabric/prov/verbs/src/verbs_ep.c b/deps/libfabric/prov/verbs/src/verbs_ep.c
index a2df0a272a9d9ecef069b2a3a3ced43ef6f3bb94..a892677cf976cee6988fbc254826260df0aa54f2 100644
--- a/deps/libfabric/prov/verbs/src/verbs_ep.c
+++ b/deps/libfabric/prov/verbs/src/verbs_ep.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2018 Intel Corporation, Inc.  All rights reserved.
+ * Copyright (c) 2019 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,15 +38,32 @@
 static struct fi_ops_msg vrb_srq_msg_ops;
 
 
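+/* ofi_ops_flow_ctrl.add_credits handler: invoked when a credit update
+ * from the peer grants more room in its receive queue.  The TX CQ lock
+ * serializes this against the credit checks in vrb_post_send(). */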
+void vrb_add_credits(struct fid_ep *ep_fid, size_t credits)
+{
+	struct vrb_ep *ep;
+	struct util_cq *cq;
+
+	ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid);
+	cq = ep->util_ep.tx_cq;
+
+	cq->cq_fastlock_acquire(&cq->cq_lock);
+	ep->peer_rq_credits += credits;
+	cq->cq_fastlock_release(&cq->cq_lock);
+}
+
 /* Receive CQ credits are pre-allocated */
 ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr)
 {
+	struct vrb_domain *domain;
 	struct vrb_context *ctx;
 	struct vrb_cq *cq;
 	struct ibv_recv_wr *bad_wr;
+	uint64_t credits_to_give;
 	int ret;
 
 	cq = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq);
+	domain = vrb_ep_to_domain(ep);
+
 	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
 	ctx = ofi_buf_alloc(cq->ctx_pool);
 	if (!ctx)
@@ -60,7 +78,22 @@ ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr)
 	wr->wr_id = (uintptr_t) ctx->user_ctx;
 	if (ret)
 		goto freebuf;
+
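+	/* Batch newly posted receive buffers into one credit update once
+	 * the threshold is crossed; if sending the update fails below,
+	 * the credits are returned for a later attempt. */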
+	if (++ep->rq_credits_avail >= ep->threshold) {
+		credits_to_give = ep->rq_credits_avail;
+		ep->rq_credits_avail = 0;
+	} else {
+		credits_to_give = 0;
+	}
 	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
+
+	if (credits_to_give &&
+	    domain->send_credits(&ep->util_ep.ep_fid, credits_to_give)) {
+		cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
+		ep->rq_credits_avail += credits_to_give;
+		cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
+	}
+
 	return 0;
 
 freebuf:
@@ -70,42 +103,54 @@ unlock:
 	return -FI_EAGAIN;
 }
 
-ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr)
+ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags)
 {
 	struct vrb_context *ctx;
+	struct vrb_domain *domain;
 	struct vrb_cq *cq;
+	struct vrb_cq *cq_rx;
 	struct ibv_send_wr *bad_wr;
 	struct ibv_wc wc;
+	size_t credits_to_give = 0;
 	int ret;
 
 	cq = container_of(ep->util_ep.tx_cq, struct vrb_cq, util_cq);
+	domain = vrb_ep_to_domain(ep);
 	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
 	ctx = ofi_buf_alloc(cq->ctx_pool);
 	if (!ctx)
 		goto unlock;
 
-	if (!cq->credits || !ep->tx_credits) {
+	if (!cq->credits || !ep->sq_credits || !ep->peer_rq_credits) {
 		ret = vrb_poll_cq(cq, &wc);
 		if (ret > 0)
 			vrb_save_wc(cq, &wc);
 
-		if (!cq->credits || !ep->tx_credits)
+		if (!cq->credits || !ep->sq_credits || !ep->peer_rq_credits) {
 			goto freebuf;
+		}
+	}
+
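+	/* Sends that consume a peer receive buffer are gated on
+	 * peer_rq_credits; FI_PRIORITY (set on credit update messages)
+	 * is allowed to spend the reserved last credit. */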
+	if (vrb_wr_consumes_recv(wr) && !--ep->peer_rq_credits &&
+	    !(flags & FI_PRIORITY)) {
+		/* Last credit is reserved for credit update */
+		ep->peer_rq_credits++;
+		goto freebuf;
 	}
 
 	cq->credits--;
-	ep->tx_credits--;
+	ep->sq_credits--;
 
 	ctx->ep = ep;
 	ctx->user_ctx = (void *) (uintptr_t) wr->wr_id;
-	ctx->flags = FI_TRANSMIT;
+	ctx->flags = FI_TRANSMIT | flags;
 	wr->wr_id = (uintptr_t) ctx;
 
 	ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr);
 	wr->wr_id = (uintptr_t) ctx->user_ctx;
 	if (ret) {
-		VERBS_WARN(FI_LOG_EP_DATA,
-			   "Post send failed - %zd\n", vrb_convert_ret(ret));
+		VERBS_WARN(FI_LOG_EP_DATA, "Post send failed - %zd\n",
+			   vrb_convert_ret(ret));
 		goto credits;
 	}
 	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
@@ -113,12 +158,27 @@ ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr)
 	return 0;
 
 credits:
+	if (vrb_wr_consumes_recv(wr))
+		ep->peer_rq_credits++;
 	cq->credits++;
-	ep->tx_credits++;
+	ep->sq_credits++;
 freebuf:
 	ofi_buf_free(ctx);
 unlock:
 	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
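+	/* The send failed, but still flush any receive credits that have
+	 * crossed the threshold so the peer can continue transmitting. */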
+	cq_rx = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq);
+	cq_rx->util_cq.cq_fastlock_acquire(&cq_rx->util_cq.cq_lock);
+	if (ep->rq_credits_avail >= ep->threshold) {
+		credits_to_give = ep->rq_credits_avail;
+		ep->rq_credits_avail = 0;
+	}
+	cq_rx->util_cq.cq_fastlock_release(&cq_rx->util_cq.cq_lock);
+	if (credits_to_give &&
+	    domain->send_credits(&ep->util_ep.ep_fid, credits_to_give)) {
+		cq_rx->util_cq.cq_fastlock_acquire(&cq_rx->util_cq.cq_lock);
+		ep->rq_credits_avail += credits_to_give;
+		cq_rx->util_cq.cq_fastlock_release(&cq_rx->util_cq.cq_lock);
+	}
 	return -FI_EAGAIN;
 }
 
@@ -126,21 +186,21 @@ static inline int vrb_msg_ep_cmdata_size(fid_t fid)
 {
 	struct vrb_pep *pep;
 	struct vrb_ep *ep;
-	struct fi_info *info;
+	bool is_xrc;
 
 	switch (fid->fclass) {
 	case FI_CLASS_PEP:
 		pep = container_of(fid, struct vrb_pep, pep_fid.fid);
-		info = pep->info;
+		is_xrc = vrb_is_xrc_info(pep->info);
 		break;
 	case FI_CLASS_EP:
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid.fid);
-		info = ep->info;
+		is_xrc = vrb_is_xrc_ep(ep);
 		break;
 	default:
-		info = NULL;
+		is_xrc = 0;
 	};
-	if (vrb_is_xrc(info))
+	if (is_xrc)
 		return VERBS_CM_DATA_SIZE - sizeof(struct vrb_xrc_cm_data);
 	else
 		return VERBS_CM_DATA_SIZE;
@@ -243,7 +303,7 @@ vrb_alloc_init_ep(struct fi_info *info, struct vrb_domain *domain,
 	struct vrb_xrc_ep *xrc_ep;
 	int ret;
 
-	if (vrb_is_xrc(info)) {
+	if (vrb_is_xrc_info(info)) {
 		xrc_ep = calloc(1, sizeof(*xrc_ep));
 		if (!xrc_ep)
 			return NULL;
@@ -255,13 +315,9 @@ vrb_alloc_init_ep(struct fi_info *info, struct vrb_domain *domain,
 			return NULL;
 	}
 
-	ep->info = fi_dupinfo(info);
-	if (!ep->info)
-		goto err1;
-
 	if (domain->util_domain.threading != FI_THREAD_SAFE) {
 		if (vrb_alloc_wrs(ep))
-			goto err2;
+			goto err1;
 	}
 
 	ret = ofi_endpoint_init(&domain->util_domain.domain_fid, &vrb_util_prov, info,
@@ -269,20 +325,18 @@ vrb_alloc_init_ep(struct fi_info *info, struct vrb_domain *domain,
 	if (ret) {
 		VERBS_WARN(FI_LOG_EP_CTRL,
 			   "Unable to initialize EP, error - %d\n", ret);
-		goto err3;
+		goto err2;
 	}
 
 	ep->util_ep.ep_fid.msg = calloc(1, sizeof(*ep->util_ep.ep_fid.msg));
 	if (!ep->util_ep.ep_fid.msg)
-		goto err4;
+		goto err3;
 
 	return ep;
-err4:
-	(void) ofi_endpoint_close(&ep->util_ep);
 err3:
-	vrb_free_wrs(ep);
+	(void) ofi_endpoint_close(&ep->util_ep);
 err2:
-	fi_freeinfo(ep->info);
+	vrb_free_wrs(ep);
 err1:
 	free(ep);
 	return NULL;
@@ -295,7 +349,7 @@ static int vrb_close_free_ep(struct vrb_ep *ep)
 
 	free(ep->util_ep.ep_fid.msg);
 	ep->util_ep.ep_fid.msg = NULL;
-	free(ep->cm_hdr);
+	free(ep->cm_priv_data);
 
 	if (ep->util_ep.rx_cq) {
 		cq = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq);
@@ -308,7 +362,8 @@ static int vrb_close_free_ep(struct vrb_ep *ep)
 		return ret;
 
 	vrb_free_wrs(ep);
-	fi_freeinfo(ep->info);
+	free(ep->info_attr.src_addr);
+	free(ep->info_attr.dest_addr);
 	free(ep);
 
 	return 0;
@@ -352,7 +407,7 @@ static int vrb_ep_close(fid_t fid)
 			vrb_eq_remove_events(ep->eq, fid);
 		}
 
-		if (vrb_is_xrc(ep->info))
+		if (vrb_is_xrc_ep(ep))
 			vrb_ep_xrc_close(ep);
 		else
 			rdma_destroy_ep(ep->id);
@@ -424,7 +479,7 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 				VERBS_WARN(FI_LOG_DOMAIN,
 					   "Rx CQ is fully reserved\n");
 				ep->rx_cq_size = 0;
-			} 
+			}
 			cq->credits -= ep->rx_cq_size;
 			cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
 		}
@@ -445,7 +500,7 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
 
 		/* Make sure EQ channel is not polled during migrate */
 		fastlock_acquire(&ep->eq->lock);
-		if (vrb_is_xrc(ep->info))
+		if (vrb_is_xrc_ep(ep))
 			ret = vrb_ep_xrc_set_tgt_chan(ep);
 		else
 			ret = rdma_migrate_id(ep->id, ep->eq->channel);
@@ -627,6 +682,14 @@ static int vrb_ep_enable_xrc(struct vrb_ep *ep)
 		goto done;
 	}
 
+	if (cq->credits < srq_ep->xrc.max_recv_wr) {
+		VERBS_WARN(FI_LOG_EP_CTRL,
+			   "CQ credits %" PRId64 " insufficient\n",
+			   cq->credits);
+		ret = -FI_EINVAL;
+		goto done;
+	}
+
 	memset(&attr, 0, sizeof(attr));
 	attr.attr.max_wr = srq_ep->xrc.max_recv_wr;
 	attr.attr.max_sge = srq_ep->xrc.max_sge;
@@ -648,6 +711,7 @@ static int vrb_ep_enable_xrc(struct vrb_ep *ep)
 	cq->util_cq.cq_fastlock_acquire(&cq->xrc.srq_list_lock);
 	dlist_insert_tail(&srq_ep->xrc.srq_entry, &cq->xrc.srq_list);
 	srq_ep->xrc.cq = cq;
+	cq->credits -= srq_ep->xrc.max_recv_wr;
 	cq->util_cq.cq_fastlock_release(&cq->xrc.srq_list_lock);
 
 	ibv_get_srq_num(srq_ep->srq, &xrc_ep->srqn);
@@ -665,7 +729,7 @@ done:
 }
 
 void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep,
-			       struct ibv_qp_init_attr *attr)
+			    struct ibv_qp_init_attr *attr)
 {
 	attr->qp_context = ep;
 
@@ -673,8 +737,8 @@ void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep,
 		struct vrb_cq *cq = container_of(ep->util_ep.tx_cq,
 						    struct vrb_cq, util_cq);
 
-		attr->cap.max_send_wr = ep->info->tx_attr->size;
-		attr->cap.max_send_sge = ep->info->tx_attr->iov_limit;
+		attr->cap.max_send_wr = ep->info_attr.tx_size;
+		attr->cap.max_send_sge = ep->info_attr.tx_iov_limit;
 		attr->send_cq = cq->cq;
 	} else {
 		struct vrb_cq *cq =
@@ -687,8 +751,8 @@ void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep,
 		struct vrb_cq *cq =
 			container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq);
 
-		attr->cap.max_recv_wr = ep->info->rx_attr->size;
-		attr->cap.max_recv_sge = ep->info->rx_attr->iov_limit;
+		attr->cap.max_recv_wr = ep->info_attr.rx_size;
+		attr->cap.max_recv_sge = ep->info_attr.rx_iov_limit;
 		attr->recv_cq = cq->cq;
 	} else {
 		struct vrb_cq *cq =
@@ -696,7 +760,7 @@ void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep,
 
 		attr->recv_cq = cq->cq;
 	}
-	attr->cap.max_inline_data = ep->info->tx_attr->inject_size;
+	attr->cap.max_inline_data = ep->info_attr.inject_size;
 	attr->qp_type = IBV_QPT_RC;
 	attr->sq_sig_all = 1;
 
@@ -821,30 +885,30 @@ static int vrb_dgram_ep_setname(fid_t ep_fid, void *addr, size_t addrlen)
 	int ret = FI_SUCCESS;
 
 	ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid.fid);
-	if (addrlen < ep->info->src_addrlen) {
+	if (addrlen < ep->info_attr.src_addrlen) {
 		VERBS_INFO(FI_LOG_EP_CTRL,
 			   "addrlen expected: %zu, got: %zu\n",
-			   ep->info->src_addrlen, addrlen);
+			   ep->info_attr.src_addrlen, addrlen);
 		return -FI_ETOOSMALL;
 	}
 	/*
 	 * Save the previous address so it can be rolled back on failure.
 	 */
-	save_addr = ep->info->src_addr;
+	save_addr = ep->info_attr.src_addr;
 
-	ep->info->src_addr = calloc(1, ep->info->src_addrlen);
-	if (!ep->info->src_addr) {
-		ep->info->src_addr = save_addr;
+	ep->info_attr.src_addr = calloc(1, ep->info_attr.src_addrlen);
+	if (!ep->info_attr.src_addr) {
+		ep->info_attr.src_addr = save_addr;
 		ret = -FI_ENOMEM;
 		goto err;
 	}
 
-	memcpy(ep->info->src_addr, addr, ep->info->src_addrlen);
-	memcpy(&ep->ep_name, addr, ep->info->src_addrlen);
+	memcpy(ep->info_attr.src_addr, addr, ep->info_attr.src_addrlen);
+	memcpy(&ep->ep_name, addr, ep->info_attr.src_addrlen);
 
 err:
-	ep->info->src_addr = save_addr;
+	ep->info_attr.src_addr = save_addr;
 	return ret;
 }
 
@@ -889,6 +953,39 @@ static struct fi_ops_cm vrb_dgram_cm_ops = {
 	.join = fi_no_join,
 };
 
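+/* Keep only the fi_info fields the endpoint actually needs, replacing
+ * the full fi_dupinfo() clone that used to hang off each endpoint. */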
+static int vrb_ep_save_info_attr(struct vrb_ep *ep, struct fi_info *info)
+{
+	ep->info_attr.protocol = info->ep_attr ? info->ep_attr->protocol :
+				 FI_PROTO_UNSPEC;
+	ep->info_attr.inject_size = info->tx_attr->inject_size;
+	ep->info_attr.tx_size = info->tx_attr->size;
+	ep->info_attr.tx_iov_limit = info->tx_attr->iov_limit;
+	ep->info_attr.rx_size = info->rx_attr->size;
+	ep->info_attr.rx_iov_limit = info->rx_attr->iov_limit;
+	ep->info_attr.addr_format = info->addr_format;
+	ep->info_attr.handle = info->handle;
+
+	if (info->src_addr) {
+		ep->info_attr.src_addr = mem_dup(info->src_addr, info->src_addrlen);
+		if (ep->info_attr.src_addr == NULL) {
+			VERBS_WARN(FI_LOG_EP_CTRL,
+				   "Failed to allocate src addr copy\n");
+			return -FI_ENOMEM;
+		}
+		ep->info_attr.src_addrlen = info->src_addrlen;
+	}
+	if (info->dest_addr) {
+		ep->info_attr.dest_addr = mem_dup(info->dest_addr, info->dest_addrlen);
+		if (ep->info_attr.dest_addr == NULL) {
+			VERBS_WARN(FI_LOG_EP_CTRL,
+				   "Failed to allocate dest addr copy\n");
+			free(ep->info_attr.src_addr);
+			ep->info_attr.src_addr = NULL;
+			return -FI_ENOMEM;
+		}
+		ep->info_attr.dest_addrlen = info->dest_addrlen;
+	}
+	return FI_SUCCESS;
+}
+
 int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		   struct fid_ep **ep_fid, void *context)
 {
@@ -946,7 +1043,12 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		return -FI_ENOMEM;
 	}
 
-	ep->inject_limit = ep->info->tx_attr->inject_size;
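+	/* Defaults leave flow control disabled (unlimited peer credits,
+	 * unreachable update threshold) until a layered provider enables
+	 * it through OFI_OPS_FLOW_CTRL. */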
+	ep->peer_rq_credits = UINT64_MAX;
+	ep->threshold = INT64_MAX; /* disables RQ flow control */
+
+	ret = vrb_ep_save_info_attr(ep, info);
+	if (ret)
+		goto err1;
 
 	switch (info->ep_attr->type) {
 	case FI_EP_MSG:
@@ -975,8 +1077,8 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		if (!info->handle) {
 			/* Only RC, XRC active RDMA CM ID is created at connect */
 			if (!(dom->flags & VRB_USE_XRC)) {
-				ret = vrb_create_ep(info, RDMA_PS_TCP,
-						       &ep->id);
+				ret = vrb_create_ep(ep,
+					vrb_get_port_space(info->addr_format), &ep->id);
 				if (ret)
 					goto err1;
 				ep->id->context = &ep->util_ep.ep_fid.fid;
@@ -1036,13 +1138,17 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
 		goto err1;
 	}
 
-	if (info->ep_attr->rx_ctx_cnt == 0 || 
-	    info->ep_attr->rx_ctx_cnt == 1)
-		ep->rx_cq_size = info->rx_attr->size;
-	
-	if (info->ep_attr->tx_ctx_cnt == 0 || 
-	    info->ep_attr->tx_ctx_cnt == 1)
-		ep->tx_credits = info->tx_attr->size;
+	if (info->ep_attr->rx_ctx_cnt == 0 ||
+	    info->ep_attr->rx_ctx_cnt == 1) {
+		ep->rx_cq_size = info->rx_attr ? info->rx_attr->size :
+				 fi->rx_attr->size;
+	}
+
+	if (info->ep_attr->tx_ctx_cnt == 0 ||
+	    info->ep_attr->tx_ctx_cnt == 1) {
+		ep->sq_credits = info->tx_attr ? info->tx_attr->size :
+				 fi->tx_attr->size;
+	}
 
 	*ep_fid = &ep->util_ep.ep_fid;
 	ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops;
@@ -1075,20 +1181,20 @@ static int vrb_pep_bind(fid_t fid, struct fid *bfid, uint64_t flags)
 	 * it limits an EQ to a single passive endpoint. TODO: implement
 	 * a more general solution.
 	 */
-	if (vrb_is_xrc(pep->info)) {
-	       if (pep->eq->xrc.pep_port) {
+	if (vrb_is_xrc_info(pep->info)) {
+		if (pep->eq->xrc.pep_port) {
 			VERBS_WARN(FI_LOG_EP_CTRL,
 				   "XRC limits EQ binding to a single PEP\n");
 			return -FI_EINVAL;
-	       }
-	       pep->eq->xrc.pep_port = ntohs(rdma_get_src_port(pep->id));
+		}
+		pep->eq->xrc.pep_port = ntohs(rdma_get_src_port(pep->id));
 	}
 
 	ret = rdma_migrate_id(pep->id, pep->eq->channel);
 	if (ret)
 		return -errno;
 
-	if (vrb_is_xrc(pep->info)) {
+	if (vrb_is_xrc_info(pep->info)) {
 		ret = rdma_migrate_id(pep->xrc_ps_udp_id, pep->eq->channel);
 		if (ret)
 			return -errno;
@@ -1177,7 +1283,8 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 		_pep->info->dest_addrlen = 0;
 	}
 
-	ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid, RDMA_PS_TCP);
+	ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid,
+			     vrb_get_port_space(_pep->info->addr_format));
 	if (ret) {
 		VERBS_INFO(FI_LOG_DOMAIN, "Unable to create PEP rdma_cm_id\n");
 		goto err2;
@@ -1193,7 +1300,7 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
 	}
 
 	/* XRC listens on both RDMA_PS_TCP and RDMA_PS_UDP */
-	if (vrb_is_xrc(info)) {
+	if (vrb_is_xrc_info(info)) {
 		ret = rdma_create_id(NULL, &_pep->xrc_ps_udp_id,
 				     &_pep->pep_fid.fid, RDMA_PS_UDP);
 		if (ret) {
@@ -1470,6 +1577,7 @@ int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep)
 		VERBS_WARN(FI_LOG_EP_CTRL, "Cannot destroy SRQ rc=%d\n", ret);
 		return -ret;
 	}
+	srq_ep->xrc.cq->credits += srq_ep->xrc.max_recv_wr;
 	srq_ep->srq = NULL;
 	srq_ep->xrc.cq = NULL;
 	dlist_remove(&srq_ep->xrc.srq_entry);
@@ -1482,13 +1590,14 @@ static int vrb_srq_close(fid_t fid)
 {
 	struct vrb_srq_ep *srq_ep = container_of(fid, struct vrb_srq_ep,
 						 ep_fid.fid);
+	struct vrb_cq *cq = srq_ep->xrc.cq;
 	int ret;
 
 	if (srq_ep->domain->flags & VRB_USE_XRC) {
-		if (srq_ep->xrc.cq) {
-			fastlock_acquire(&srq_ep->xrc.cq->xrc.srq_list_lock);
+		if (cq) {
+			fastlock_acquire(&cq->xrc.srq_list_lock);
 			ret = vrb_xrc_close_srq(srq_ep);
-			fastlock_release(&srq_ep->xrc.cq->xrc.srq_list_lock);
+			fastlock_release(&cq->xrc.srq_list_lock);
 			if (ret)
 				goto err;
 		}
diff --git a/deps/libfabric/prov/verbs/src/verbs_eq.c b/deps/libfabric/prov/verbs/src/verbs_eq.c
index 6bfa48b5354e27dd2962ffff35d3dbac407d158f..33473b63124ac5adfbfcdb6b64453f74b2401b1d 100644
--- a/deps/libfabric/prov/verbs/src/verbs_eq.c
+++ b/deps/libfabric/prov/verbs/src/verbs_eq.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2013-2015 Intel Corporation, Inc.  All rights reserved.
- * Copyright (c) 2018 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 Cray Inc. All rights reserved.
+ * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -225,13 +226,15 @@ vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info,
 
 	free((*info)->src_addr);
 
-	(*info)->src_addrlen = vrb_sockaddr_len(rdma_get_local_addr(event->id));
-	if (!((*info)->src_addr = malloc((*info)->src_addrlen)))
+	(*info)->src_addrlen = ofi_sizeofaddr(rdma_get_local_addr(event->id));
+	(*info)->src_addr = malloc((*info)->src_addrlen);
+	if (!((*info)->src_addr))
 		goto err2;
 	memcpy((*info)->src_addr, rdma_get_local_addr(event->id), (*info)->src_addrlen);
 
-	(*info)->dest_addrlen = vrb_sockaddr_len(rdma_get_peer_addr(event->id));
-	if (!((*info)->dest_addr = malloc((*info)->dest_addrlen)))
+	(*info)->dest_addrlen = ofi_sizeofaddr(rdma_get_peer_addr(event->id));
+	(*info)->dest_addr = malloc((*info)->dest_addrlen);
+	if (!((*info)->dest_addr))
 		goto err2;
 	memcpy((*info)->dest_addr, rdma_get_peer_addr(event->id), (*info)->dest_addrlen);
 
@@ -248,7 +251,7 @@ vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info,
 	connreq->handle.fclass = FI_CLASS_CONNREQ;
 	connreq->id = event->id;
 
-	if (vrb_is_xrc(*info)) {
+	if (vrb_is_xrc_info(*info)) {
 		connreq->is_xrc = 1;
 		ret = vrb_eq_set_xrc_info(event, &connreq->xrc);
 		if (ret)
@@ -281,6 +284,17 @@ static inline int vrb_eq_copy_event_data(struct fi_eq_cm_entry *entry,
 	return datalen;
 }
 
+static void vrb_eq_skip_rdma_cm_hdr(const void **priv_data,
+				     size_t *priv_data_len)
+{
+	size_t rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr);
+
+	if (*priv_data_len > rdma_cm_hdr_len) {
+		*priv_data = (void *) ((char *) *priv_data + rdma_cm_hdr_len);
+		*priv_data_len -= rdma_cm_hdr_len;
+	}
+}
+
 static void vrb_eq_skip_xrc_cm_data(const void **priv_data,
 				       size_t *priv_data_len)
 {
@@ -309,19 +323,19 @@ static int vrb_sidr_conn_compare(struct ofi_rbmap *map,
 	int ret;
 
 	assert(_key->addr->sa_family ==
-	       ofi_sa_family(ep->base_ep.info->dest_addr));
+	       ofi_sa_family(ep->base_ep.info_attr.dest_addr));
 
 	/* The interface address and the passive endpoint port define
 	 * the unique connection to a peer */
 	switch(_key->addr->sa_family) {
 	case AF_INET:
 		ret = memcmp(&ofi_sin_addr(_key->addr),
-			     &ofi_sin_addr(ep->base_ep.info->dest_addr),
+			     &ofi_sin_addr(ep->base_ep.info_attr.dest_addr),
 			     sizeof(ofi_sin_addr(_key->addr)));
 		break;
 	case AF_INET6:
 		ret = memcmp(&ofi_sin6_addr(_key->addr),
-			     &ofi_sin6_addr(ep->base_ep.info->dest_addr),
+			     &ofi_sin6_addr(ep->base_ep.info_attr.dest_addr),
 			     sizeof(ofi_sin6_addr(_key->addr)));
 		break;
 	default:
@@ -367,7 +381,7 @@ int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
 	assert(param_len);
 	assert(ep->tgt_id && ep->tgt_id->ps == RDMA_PS_UDP);
 
-	vrb_set_sidr_conn_key(ep->base_ep.info->dest_addr,
+	vrb_set_sidr_conn_key(ep->base_ep.info_attr.dest_addr,
 				 ep->remote_pep_port, ep->recip_accept, &key);
 	ep->accept_param_data = calloc(1, param_len);
 	if (!ep->accept_param_data) {
@@ -506,7 +520,7 @@ vrb_eq_xrc_connreq_event(struct vrb_eq *eq, struct fi_eq_cm_entry *entry,
 
 	ep->tgt_id = connreq->id;
 	ep->tgt_id->context = &ep->base_ep.util_ep.ep_fid.fid;
-	ep->base_ep.info->handle = entry->info->handle;
+	ep->base_ep.info_attr.handle = entry->info->handle;
 
 	ret = rdma_migrate_id(ep->tgt_id, ep->base_ep.eq->channel);
 	if (ret) {
@@ -694,14 +708,14 @@ static inline int
 vrb_eq_xrc_connect_retry(struct vrb_xrc_ep *ep,
 			 struct rdma_cm_event *cma_event, int *acked)
 {
-	if (ep->base_ep.info->src_addr)
+	if (ep->base_ep.info_attr.src_addr)
 		ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL,
 				"Connect retry src ",
-				ep->base_ep.info->src_addr);
-	if (ep->base_ep.info->dest_addr)
+				ep->base_ep.info_attr.src_addr);
+	if (ep->base_ep.info_attr.dest_addr)
 		ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL,
 				"Connect retry dest ",
-				ep->base_ep.info->dest_addr);
+				ep->base_ep.info_attr.dest_addr);
 
 	*acked = 1;
 	rdma_ack_cm_event(cma_event);
@@ -752,12 +766,12 @@ vrb_eq_xrc_cm_err_event(struct vrb_eq *eq,
 
 	VERBS_WARN(FI_LOG_EP_CTRL, "CM error event %s, status %d\n",
 		   rdma_event_str(cma_event->event), cma_event->status);
-	if (ep->base_ep.info->src_addr)
+	if (ep->base_ep.info_attr.src_addr)
 		ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
-				"Src ", ep->base_ep.info->src_addr);
-	if (ep->base_ep.info->dest_addr)
+				"Src ", ep->base_ep.info_attr.src_addr);
+	if (ep->base_ep.info_attr.dest_addr)
 		ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL,
-				"Dest ", ep->base_ep.info->dest_addr);
+				"Dest ", ep->base_ep.info_attr.dest_addr);
         ep->conn_state = VRB_XRC_ERROR;
         return FI_SUCCESS;
 }
@@ -860,7 +874,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 			FI_WARN(&vrb_prov, FI_LOG_EP_CTRL,
 				"rdma_connect failed: %s (%d)\n",
 				strerror(-ret), -ret);
-			if (vrb_is_xrc(ep->info)) {
+			if (vrb_is_xrc_ep(ep)) {
 				xrc_ep = container_of(fid, struct vrb_xrc_ep,
 						      base_ep.util_ep.ep_fid);
 				vrb_put_shared_ini_conn(xrc_ep);
@@ -882,7 +896,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 			goto err;
 		}
 
-		if (vrb_is_xrc(entry->info)) {
+		if (vrb_is_xrc_info(entry->info)) {
 			ret = vrb_eq_xrc_connreq_event(eq, entry, len, event,
 							  cma_event, &acked,
 							  &priv_data, &priv_datalen);
@@ -893,6 +907,8 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 			}
 			if (*event == FI_CONNECTED)
 				goto ack;
+		} else if (cma_event->id->route.addr.src_addr.sa_family == AF_IB) {
+			vrb_eq_skip_rdma_cm_hdr(&priv_data, &priv_datalen);
 		}
 		break;
 	case RDMA_CM_EVENT_CONNECT_RESPONSE:
@@ -907,7 +923,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 				goto ack;
 		}
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid);
-		if (vrb_is_xrc(ep->info)) {
+		if (vrb_is_xrc_ep(ep)) {
 			ret = vrb_eq_xrc_connected_event(eq, cma_event,
 							    &acked, entry, len,
 							    event);
@@ -917,7 +933,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 		break;
 	case RDMA_CM_EVENT_DISCONNECTED:
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid);
-		if (vrb_is_xrc(ep->info)) {
+		if (vrb_is_xrc_ep(ep)) {
 			vrb_eq_xrc_disconnect_event(eq, cma_event, &acked);
 			ret = -FI_EAGAIN;
 			goto ack;
@@ -927,7 +943,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 		break;
 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid);
-		if (vrb_is_xrc(ep->info))
+		if (vrb_is_xrc_ep(ep))
 			vrb_eq_xrc_timewait_event(eq, cma_event, &acked);
 		ret = -FI_EAGAIN;
 		goto ack;
@@ -936,8 +952,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid);
-		assert(ep->info);
-		if (vrb_is_xrc(ep->info)) {
+		if (vrb_is_xrc_ep(ep)) {
 			/* SIDR Reject is reported as UNREACHABLE unless
 			 * status is negative */
 			if (cma_event->id->ps == RDMA_PS_UDP &&
@@ -963,7 +978,7 @@ vrb_eq_cm_process_event(struct vrb_eq *eq,
 		goto err;
 	case RDMA_CM_EVENT_REJECTED:
 		ep = container_of(fid, struct vrb_ep, util_ep.ep_fid);
-		if (vrb_is_xrc(ep->info)) {
+		if (vrb_is_xrc_ep(ep)) {
 xrc_shared_reject:
 			ret = vrb_eq_xrc_rej_event(eq, cma_event);
 			if (ret == -FI_EAGAIN)
diff --git a/deps/libfabric/prov/verbs/src/verbs_info.c b/deps/libfabric/prov/verbs/src/verbs_info.c
index e15f4b0020c0e130600838c056bde7dc5940ee6f..acdd96da8575b86ec0a8d1abe42a6c0044d7d414 100644
--- a/deps/libfabric/prov/verbs/src/verbs_info.c
+++ b/deps/libfabric/prov/verbs/src/verbs_info.c
@@ -35,6 +35,7 @@
 #include <ifaddrs.h>
 #include <net/if.h>
 #include <stdint.h>
+#include <rdma/rdma_cma.h>
 
 #include "fi_verbs.h"
 
@@ -292,8 +293,9 @@ static int vrb_check_hints(uint32_t version, const struct fi_info *hints,
 	return FI_SUCCESS;
 }
 
-int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags,
-		     struct rdma_addrinfo *rai)
+int vrb_set_rai(uint32_t addr_format, void *src_addr, size_t src_addrlen,
+		void *dest_addr, size_t dest_addrlen, uint64_t flags,
+		struct rdma_addrinfo *rai)
 {
 	memset(rai, 0, sizeof *rai);
 	if (flags & FI_SOURCE)
@@ -302,49 +304,49 @@ int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags,
 		rai->ai_flags |= RAI_NUMERICHOST;
 
 	rai->ai_qp_type = IBV_QPT_RC;
-	rai->ai_port_space = RDMA_PS_TCP;
 
-	if (!fi)
-		return 0;
-
-	switch(fi->addr_format) {
+	switch(addr_format) {
 	case FI_SOCKADDR_IN:
 	case FI_FORMAT_UNSPEC:
+		rai->ai_port_space = RDMA_PS_TCP;
 		rai->ai_family = AF_INET;
 		rai->ai_flags |= RAI_FAMILY;
 		break;
 	case FI_SOCKADDR_IN6:
+		rai->ai_port_space = RDMA_PS_TCP;
 		rai->ai_family = AF_INET6;
 		rai->ai_flags |= RAI_FAMILY;
 		break;
 	case FI_SOCKADDR_IB:
+		rai->ai_port_space = RDMA_PS_IB;
 		rai->ai_family = AF_IB;
 		rai->ai_flags |= RAI_FAMILY;
 		break;
 	case FI_SOCKADDR:
-		if (fi->src_addrlen) {
-			rai->ai_family = ((struct sockaddr *)fi->src_addr)->sa_family;
+		rai->ai_port_space = RDMA_PS_TCP;
+		if (src_addrlen) {
+			rai->ai_family = ((struct sockaddr *)src_addr)->sa_family;
 			rai->ai_flags |= RAI_FAMILY;
-		} else if (fi->dest_addrlen) {
-			rai->ai_family = ((struct sockaddr *)fi->dest_addr)->sa_family;
+		} else if (dest_addrlen) {
+			rai->ai_family = ((struct sockaddr *)dest_addr)->sa_family;
 			rai->ai_flags |= RAI_FAMILY;
 		}
 		break;
 	default:
-		VERBS_INFO(FI_LOG_FABRIC, "Unknown fi->addr_format\n");
+		VERBS_INFO(FI_LOG_FABRIC, "Unknown addr_format\n");
 	}
 
-	if (fi->src_addrlen) {
-		if (!(rai->ai_src_addr = malloc(fi->src_addrlen)))
+	if (src_addrlen) {
+		if (!(rai->ai_src_addr = malloc(src_addrlen)))
 			return -FI_ENOMEM;
-		memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen);
-		rai->ai_src_len = fi->src_addrlen;
+		memcpy(rai->ai_src_addr, src_addr, src_addrlen);
+		rai->ai_src_len = src_addrlen;
 	}
-	if (fi->dest_addrlen) {
-		if (!(rai->ai_dst_addr = malloc(fi->dest_addrlen)))
+	if (dest_addrlen) {
+		if (!(rai->ai_dst_addr = malloc(dest_addrlen)))
 			return -FI_ENOMEM;
-		memcpy(rai->ai_dst_addr, fi->dest_addr, fi->dest_addrlen);
-		rai->ai_dst_len = fi->dest_addrlen;
+		memcpy(rai->ai_dst_addr, dest_addr, dest_addrlen);
+		rai->ai_dst_len = dest_addrlen;
 	}
 
 	return 0;
@@ -534,55 +536,6 @@ static const char *vrb_link_layer_str(uint8_t link_layer)
 	}
 }
 
-static size_t vrb_speed(uint8_t speed, uint8_t width)
-{
-	const size_t gbit_2_bit_coef = 1024 * 1024;
-	size_t width_val, speed_val;
-
-	switch (speed) {
-	case 1:
-		speed_val = (size_t) (2.5 * (float) gbit_2_bit_coef);
-		break;
-	case 2:
-		speed_val = 5 * gbit_2_bit_coef;
-		break;
-	case 4:
-	case 8:
-		speed_val = 8 * gbit_2_bit_coef;
-		break;
-	case 16:
-		speed_val = 14 * gbit_2_bit_coef;
-		break;
-	case 32:
-		speed_val = 25 * gbit_2_bit_coef;
-		break;
-	default:
-		speed_val = 0;
-		break;
-	}
-
-	switch (width) {
-	case 1:
-		width_val = 1;
-		break;
-	case 2:
-		width_val = 4;
-		break;
-	case 4:
-		width_val = 8;
-		break;
-	case 8:
-		width_val = 12;
-		break;
-	default:
-		width_val = 0;
-		break;
-	}
-
-	return width_val * speed_val;
-}
-
-
 static int vrb_get_device_attrs(struct ibv_context *ctx,
 				   struct fi_info *info, uint32_t protocol)
 {
@@ -717,8 +670,8 @@ static int vrb_get_device_attrs(struct ibv_context *ctx,
 
 	mtu_size = vrb_mtu_type_to_len(port_attr.active_mtu);
 	info->nic->link_attr->mtu = (size_t) (mtu_size > 0 ? mtu_size : 0);
-	info->nic->link_attr->speed = vrb_speed(port_attr.active_speed,
-						   port_attr.active_width);
+	info->nic->link_attr->speed = ofi_vrb_speed(port_attr.active_speed,
+						    port_attr.active_width);
 	info->nic->link_attr->state =
 		vrb_pstate_2_lstate(port_attr.state);
 	info->nic->link_attr->network_type =
@@ -814,7 +767,7 @@ static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info,
 		assert(0);
 		return -FI_EINVAL;
 	}
-		
+
 
 	*(fi->fabric_attr) = verbs_fabric_attr;
 
@@ -946,8 +899,10 @@ static int verbs_devs_add(struct dlist_entry *verbs_devs, char *dev_name,
 	addr->rai = rai;
 
 	dlist_foreach_container(verbs_devs, struct verbs_dev_info, dev, entry)
-		if (!strcmp(dev_name, dev->name))
+		if (!strcmp(dev_name, dev->name)) {
+			free(dev_name);
 			goto add_rai;
+		}
 
 	if (!(dev = malloc(sizeof(*dev))))
 		goto err1;
@@ -968,7 +923,7 @@ err1:
 static int vrb_ifa_rdma_info(const struct ifaddrs *ifa, char **dev_name,
 				struct rdma_addrinfo **rai)
 {
-	char name[INET6_ADDRSTRLEN];
+	char name[INET6_ADDRSTRLEN + 16];
 	struct rdma_addrinfo rai_hints = {
 		.ai_flags = RAI_PASSIVE | RAI_NUMERICHOST,
 	}, *rai_;
@@ -987,9 +942,8 @@ static int vrb_ifa_rdma_info(const struct ifaddrs *ifa, char **dev_name,
 	 * TODO should we do something similar for IPv4? */
 	if (!strncmp(name, IPV6_LINK_LOCAL_ADDR_PREFIX_STR,
 		     strlen(IPV6_LINK_LOCAL_ADDR_PREFIX_STR))) {
-		assert(strlen(name) + strlen(ifa->ifa_name) < INET6_ADDRSTRLEN);
-		strcat(name, "%");
-		strcat(name, ifa->ifa_name);
+		strncat(name, "%", sizeof(name) - strlen(name) - 1);
+		strncat(name, ifa->ifa_name, sizeof(name) - strlen(name) - 1);
 	}
 
 	ret = rdma_getaddrinfo((char *) name, NULL, &rai_hints, &rai_);
@@ -1031,6 +985,112 @@ err1:
 	return ret;
 }
 
+int vrb_get_port_space(uint32_t addr_format)
+{
+	if (addr_format == FI_SOCKADDR_IB)
+		return RDMA_PS_IB;
+	else
+		return RDMA_PS_TCP;
+}
+
+static struct rdma_addrinfo *vrb_alloc_ib_addrinfo(uint8_t port_num,
+			const union ibv_gid *gid, uint16_t pkey)
+{
+	struct rdma_addrinfo *rai;
+	struct sockaddr_ib *sib;
+
+	rai = calloc(1, sizeof(struct rdma_addrinfo));
+	if (!rai)
+		return NULL;
+
+	rai->ai_flags = RAI_PASSIVE | RAI_NUMERICHOST | RAI_FAMILY;
+	rai->ai_family = AF_IB;
+	rai->ai_port_space = RDMA_PS_IB;
+
+	sib = calloc(1, sizeof(struct sockaddr_ib));
+	if (!sib) {
+		free(rai);
+		return NULL;
+	}
+	rai->ai_src_addr = (struct sockaddr *) sib;
+	rai->ai_src_len = sizeof(struct sockaddr_ib);
+
+	sib->sib_family = AF_IB;
+	memcpy(&sib->sib_addr.sib_raw, &gid->raw, sizeof(*gid));
+	sib->sib_pkey = pkey;
+	sib->sib_scope_id = port_num;
+
+	ofi_addr_set_port((struct sockaddr *)sib, 0);
+
+	return rai;
+}
+
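+/* getifaddrs() only reports IP interfaces, so AF_IB addresses are
+ * enumerated directly: walk each device, port, GID and P_Key, and
+ * synthesize a passive rdma_addrinfo entry per usable combination. */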
+static int vrb_get_sib(struct dlist_entry *verbs_devs)
+{
+	struct rdma_addrinfo *rai = NULL;
+	struct ibv_device **devices;
+	char *dev_name = NULL;
+	int num_devices;
+	struct ibv_context *context;
+	int ret, num_verbs_ifs = 0;
+	struct ibv_device_attr device_attr;
+	struct ibv_port_attr port_attr;
+	union ibv_gid gid;
+	uint16_t pkey;
+
+	devices = ibv_get_device_list(&num_devices);
+	if (!devices)
+		return -errno;
+
+	for (int dev = 0; dev < num_devices; dev++) {
+		context = ibv_open_device(devices[dev]);
+		if (!context)
+			continue;
+
+		ret = ibv_query_device(context, &device_attr);
+		if (ret) {
+			ibv_close_device(context);
+			continue;
+		}
+
+		for (int port = 1; port <= device_attr.phys_port_cnt; port++) {
+			ret = ibv_query_port(context, port, &port_attr);
+			if (ret)
+				continue;
+
+			for (int gidx = 0; gidx < port_attr.gid_tbl_len; gidx++) {
+				/* gid_tbl_len may contain GID entries that are NULL (fe80::),
+				 * so we need to filter them out */
+				ret = ibv_query_gid(context, port, gidx, &gid);
+				if (ret || !gid.global.interface_id || !gid.global.subnet_prefix)
+					continue;
+
+				for (int pidx = 0; pidx < port_attr.pkey_tbl_len; pidx++) {
+					ret = ibv_query_pkey(context, port, pidx, &pkey);
+					if (ret || !pkey)
+						continue;
+
+					rai = vrb_alloc_ib_addrinfo(port, &gid, pkey);
+					if (!rai)
+						continue;
+
+					dev_name = strdup(ibv_get_device_name(context->device));
+					if (!dev_name) {
+						rdma_freeaddrinfo(rai);
+						ibv_close_device(context);
+						ibv_free_device_list(devices);
+						return -FI_ENOMEM;
+					}
+
+					ret = verbs_devs_add(verbs_devs, dev_name, rai);
+					if (ret) {
+						free(dev_name);
+						rdma_freeaddrinfo(rai);
+						continue;
+					}
+
+					num_verbs_ifs++;
+				}
+			}
+		}
+		ibv_close_device(context);
+	}
+
+	ibv_free_device_list(devices);
+	return num_verbs_ifs ? 0 : -FI_ENODATA;
+}
+
 /* Builds a list of interfaces that correspond to active verbs devices */
 static int vrb_getifaddrs(struct dlist_entry *verbs_devs)
 {
@@ -1157,18 +1217,6 @@ static int vrb_get_srcaddr_devs(struct fi_info **info)
 	return 0;
 }
 
-static void vrb_sockaddr_set_port(struct sockaddr *sa, uint16_t port)
-{
-	switch(sa->sa_family) {
-	case AF_INET:
-		((struct sockaddr_in *)sa)->sin_port = port;
-		break;
-	case AF_INET6:
-		((struct sockaddr_in6 *)sa)->sin6_port = port;
-		break;
-	}
-}
-
 /* the `rai` parameter is used for the MSG EP type */
 /* the `fmt`, `[src | dest]_addr` parameters are used for the DGRAM EP type */
 /* if the `fmt` parameter isn't used, pass FI_FORMAT_UNSPEC */
@@ -1228,21 +1276,17 @@ static int vrb_fill_addr(struct rdma_addrinfo *rai, struct fi_info **info,
 	 * though it fills the destination address (presence of id->verbs
 	 * corresponds to a valid dest addr) */
 	local_addr = rdma_get_local_addr(id);
-	if (!local_addr) {
-		VERBS_WARN(FI_LOG_CORE,
-			   "Unable to get local address\n");
-		return -FI_ENODATA;
-	}
 
-	rai->ai_src_len = vrb_sockaddr_len(local_addr);
-	if (!(rai->ai_src_addr = malloc(rai->ai_src_len)))
+	rai->ai_src_len = ofi_sizeofaddr(local_addr);
+	rai->ai_src_addr = malloc(rai->ai_src_len);
+	if (!rai->ai_src_addr)
 		return -FI_ENOMEM;
 
 	memcpy(rai->ai_src_addr, local_addr, rai->ai_src_len);
 	/* User didn't specify a port. Zero out the random port
 	 * assigned by rdmamcm so that this rai/fi_info can be
 	 * used multiple times to create rdma endpoints.*/
-	vrb_sockaddr_set_port(rai->ai_src_addr, 0);
+	ofi_addr_set_port(rai->ai_src_addr, 0);
 
 rai_to_fi:
 	return vrb_set_info_addrs(*info, rai, FI_FORMAT_UNSPEC,
@@ -1288,6 +1332,8 @@ int vrb_init_info(const struct fi_info **all_infos)
 	}
 
 	vrb_getifaddrs(&verbs_devs);
+	if (!vrb_gl_data.iface)
+		vrb_get_sib(&verbs_devs);
 
 	if (dlist_empty(&verbs_devs))
 		FI_WARN(&vrb_prov, FI_LOG_FABRIC,
diff --git a/deps/libfabric/prov/verbs/src/verbs_mr.c b/deps/libfabric/prov/verbs/src/verbs_mr.c
index 0fae9d19f56ee89cfc921766b3da47ba8a168739..204d17985970f50e3b62c91b99ae2f85b3e07fa6 100644
--- a/deps/libfabric/prov/verbs/src/verbs_mr.c
+++ b/deps/libfabric/prov/verbs/src/verbs_mr.c
@@ -189,7 +189,7 @@ static int vrb_mr_cache_close(fid_t fid)
 {
 	struct vrb_mem_desc *md =
 		container_of(fid, struct vrb_mem_desc, mr_fid.fid);
-	
+
 	ofi_mr_cache_delete(&md->domain->cache, md->entry);
 	return FI_SUCCESS;
 }
@@ -259,6 +259,7 @@ vrb_mr_cache_reg(struct fid *fid, const void *buf, size_t len,
 	attr.offset = offset;
 	attr.requested_key = requested_key;
 	attr.auth_key_size = 0;
+	attr.iface = FI_HMEM_SYSTEM;
 
 	ret = (flags & OFI_MR_NOCACHE) ?
 	      ofi_mr_cache_reg(&domain->cache, &attr, &entry) :
diff --git a/deps/libfabric/prov/verbs/src/verbs_msg.c b/deps/libfabric/prov/verbs/src/verbs_msg.c
index c7639bc4178df010f5a0314d72e143e622a98317..13c0d9681d4066cfc5d720d719fca1f09bd6b96e 100644
--- a/deps/libfabric/prov/verbs/src/verbs_msg.c
+++ b/deps/libfabric/prov/verbs/src/verbs_msg.c
@@ -185,7 +185,7 @@ vrb_msg_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 	ep->wrs->sge.addr = (uintptr_t) buf;
 	ep->wrs->sge.length = (uint32_t) len;
 
-	return vrb_post_send(ep, &ep->wrs->msg_wr);
+	return vrb_post_send(ep, &ep->wrs->msg_wr, 0);
 }
 
 static ssize_t vrb_msg_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
@@ -201,7 +201,7 @@ static ssize_t vrb_msg_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf
 	ep->wrs->sge.addr = (uintptr_t) buf;
 	ep->wrs->sge.length = (uint32_t) len;
 
-	ret = vrb_post_send(ep, &ep->wrs->msg_wr);
+	ret = vrb_post_send(ep, &ep->wrs->msg_wr, 0);
 	ep->wrs->msg_wr.opcode = IBV_WR_SEND;
 	return ret;
 }
diff --git a/deps/libfabric/prov/verbs/src/verbs_rma.c b/deps/libfabric/prov/verbs/src/verbs_rma.c
index 35d4521d9cabaf3eec29b61afc0e1e4e1b2d8e9d..d52cd3a178d316f509df12dc50f00dad84e9c028 100644
--- a/deps/libfabric/prov/verbs/src/verbs_rma.c
+++ b/deps/libfabric/prov/verbs/src/verbs_rma.c
@@ -134,7 +134,7 @@ vrb_msg_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc
 
 	vrb_set_sge_iov(wr.sg_list, iov, count, desc);
 
-	return vrb_post_send(ep, &wr);
+	return vrb_post_send(ep, &wr, 0);
 }
 
 static ssize_t
@@ -153,7 +153,7 @@ vrb_msg_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
 
 	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
 
-	return vrb_post_send(ep, &wr);
+	return vrb_post_send(ep, &wr, 0);
 }
 
 static ssize_t
@@ -206,7 +206,7 @@ vrb_rma_write_fast(struct fid_ep *ep_fid, const void *buf, size_t len,
 	ep->wrs->sge.addr = (uintptr_t) buf;
 	ep->wrs->sge.length = (uint32_t) len;
 
-	return vrb_post_send(ep, &ep->wrs->rma_wr);
+	return vrb_post_send(ep, &ep->wrs->rma_wr, 0);
 }
 
 static ssize_t
@@ -245,7 +245,7 @@ vrb_msg_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, const void *buf, siz
 	ep->wrs->sge.addr = (uintptr_t) buf;
 	ep->wrs->sge.length = (uint32_t) len;
 
-	ret = vrb_post_send(ep, &ep->wrs->rma_wr);
+	ret = vrb_post_send(ep, &ep->wrs->rma_wr, 0);
 	ep->wrs->rma_wr.opcode = IBV_WR_RDMA_WRITE;
 	return ret;
 }
@@ -377,7 +377,7 @@ vrb_msg_xrc_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov,
 
 	vrb_set_sge_iov(wr.sg_list, iov, count, desc);
 
-	return vrb_post_send(&ep->base_ep, &wr);
+	return vrb_post_send(&ep->base_ep, &wr, 0);
 }
 
 static ssize_t
@@ -399,7 +399,7 @@ vrb_msg_xrc_ep_rma_readmsg(struct fid_ep *ep_fid,
 
 	vrb_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc);
 
-	return vrb_post_send(&ep->base_ep, &wr);
+	return vrb_post_send(&ep->base_ep, &wr, flags);
 }
 
 static ssize_t
@@ -456,7 +456,7 @@ vrb_xrc_rma_write_fast(struct fid_ep *ep_fid, const void *buf,
 	ep->base_ep.wrs->sge.addr = (uintptr_t) buf;
 	ep->base_ep.wrs->sge.length = (uint32_t) len;
 
-	return vrb_post_send(&ep->base_ep, &ep->base_ep.wrs->rma_wr);
+	return vrb_post_send(&ep->base_ep, &ep->base_ep.wrs->rma_wr, 0);
 }
 
 static ssize_t
@@ -499,7 +499,7 @@ vrb_msg_xrc_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid,
 	ep->base_ep.wrs->sge.addr = (uintptr_t) buf;
 	ep->base_ep.wrs->sge.length = (uint32_t) len;
 
-	ret = vrb_post_send(&ep->base_ep, &ep->base_ep.wrs->rma_wr);
+	ret = vrb_post_send(&ep->base_ep, &ep->base_ep.wrs->rma_wr, 0);
 	ep->base_ep.wrs->rma_wr.opcode = IBV_WR_RDMA_WRITE;
 	return ret;
 }
diff --git a/deps/libfabric/src/abi_1_0.c b/deps/libfabric/src/abi_1_0.c
index eb3bd0eec3f1db831e7aa4cbdc31ee173bfe5fdf..34d8e605b6bd013f9d4ae2d44af61cdb771eb4d1 100644
--- a/deps/libfabric/src/abi_1_0.c
+++ b/deps/libfabric/src/abi_1_0.c
@@ -88,24 +88,22 @@ struct fi_ep_attr_1_0 {
 	size_t			rx_ctx_cnt;
 };
 
-struct fi_info_1_0 {
-	struct fi_info			*next;
-	uint64_t			caps;
-	uint64_t			mode;
-	uint32_t			addr_format;
-	size_t				src_addrlen;
-	size_t				dest_addrlen;
-	void				*src_addr;
-	void				*dest_addr;
-	fid_t				handle;
-	struct fi_tx_attr		*tx_attr;
-	struct fi_rx_attr		*rx_attr;
-	struct fi_ep_attr_1_0		*ep_attr;
-	struct fi_domain_attr_1_0	*domain_attr;
-	struct fi_fabric_attr_1_0	*fabric_attr;
+struct fi_tx_attr_1_0 {
+        uint64_t                caps;
+        uint64_t                mode;
+        uint64_t                op_flags;
+        uint64_t                msg_order;
+        uint64_t                comp_order;
+        size_t                  inject_size;
+        size_t                  size;
+        size_t                  iov_limit;
+        size_t                  rma_iov_limit;
 };
 
-struct fi_info_1_1 {
+/* External structure is still ABI 1.0 compliant */
+#define fi_rx_attr_1_0 fi_rx_attr
+
+struct fi_info_1_0 {
 	struct fi_info			*next;
 	uint64_t			caps;
 	uint64_t			mode;
@@ -115,42 +113,14 @@ struct fi_info_1_1 {
 	void				*src_addr;
 	void				*dest_addr;
 	fid_t				handle;
-	struct fi_tx_attr		*tx_attr;
-	struct fi_rx_attr		*rx_attr;
+	struct fi_tx_attr_1_0		*tx_attr;
+	struct fi_rx_attr_1_0		*rx_attr;
 	struct fi_ep_attr_1_0		*ep_attr;
 	struct fi_domain_attr_1_0	*domain_attr;
 	struct fi_fabric_attr_1_0	*fabric_attr;
 };
 
-struct fi_tx_attr_1_2 {
-        uint64_t                caps;
-        uint64_t                mode;
-        uint64_t                op_flags;
-        uint64_t                msg_order;
-        uint64_t                comp_order;
-        size_t                  inject_size;
-        size_t                  size;
-        size_t                  iov_limit;
-        size_t                  rma_iov_limit;
-};
-
-struct fi_ep_attr_1_2 {
-        enum fi_ep_type         type;
-        uint32_t                protocol;
-        uint32_t                protocol_version;
-        size_t                  max_msg_size;
-        size_t                  msg_prefix_size;
-        size_t                  max_order_raw_size;
-        size_t                  max_order_war_size;
-        size_t                  max_order_waw_size;
-        uint64_t                mem_tag_format;
-        size_t                  tx_ctx_cnt;
-        size_t                  rx_ctx_cnt;
-        size_t                  auth_key_size;
-        uint8_t                 *auth_key;
-};
-
-struct fi_domain_attr_1_2 {
+struct fi_domain_attr_1_1 {
         struct fid_domain       *domain;
         char                    *name;
         enum fi_threading       threading;
@@ -179,6 +149,35 @@ struct fi_domain_attr_1_2 {
         size_t                  mr_cnt;
 };
 
+#define fi_tx_attr_1_1 fi_tx_attr_1_0
+#define fi_rx_attr_1_1 fi_rx_attr_1_0
+#define fi_ep_attr_1_1 fi_ep_attr
+#define fi_fabric_attr_1_1 fi_fabric_attr
+
+struct fi_info_1_1 {
+	struct fi_info			*next;
+	uint64_t			caps;
+	uint64_t			mode;
+	uint32_t			addr_format;
+	size_t				src_addrlen;
+	size_t				dest_addrlen;
+	void				*src_addr;
+	void				*dest_addr;
+	fid_t				handle;
+	struct fi_tx_attr_1_1		*tx_attr;
+	struct fi_rx_attr_1_1		*rx_attr;
+	struct fi_ep_attr_1_1		*ep_attr;
+	struct fi_domain_attr_1_1	*domain_attr;
+	struct fi_fabric_attr_1_1	*fabric_attr;
+};
+
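+/* ABI 1.2 reused the 1.1 attribute layouts; the only change to
+ * struct fi_info itself was the addition of the nic pointer. */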
+#define fi_tx_attr_1_2 fi_tx_attr_1_1
+#define fi_rx_attr_1_2 fi_rx_attr_1_1
+#define fi_ep_attr_1_2 fi_ep_attr_1_1
+#define fi_domain_attr_1_2 fi_domain_attr_1_1
+#define fi_fabric_attr_1_2 fi_fabric_attr_1_1
+#define fid_nic_1_2 fid_nic
+
 struct fi_info_1_2 {
         struct fi_info            *next;
         uint64_t                  caps;
@@ -190,13 +189,22 @@ struct fi_info_1_2 {
         void                      *dest_addr;
         fid_t                     handle;
         struct fi_tx_attr_1_2     *tx_attr;
-        struct fi_rx_attr         *rx_attr;
+        struct fi_rx_attr_1_2     *rx_attr;
         struct fi_ep_attr_1_2     *ep_attr;
         struct fi_domain_attr_1_2 *domain_attr;
-        struct fi_fabric_attr     *fabric_attr;
-        struct fid_nic            *nic;
+        struct fi_fabric_attr_1_2 *fabric_attr;
+        struct fid_nic_1_2        *nic;
 };
 
+/*
+#define fi_tx_attr_1_3 fi_tx_attr
+#define fi_rx_attr_1_3 fi_rx_attr_1_2
+#define fi_ep_attr_1_3 fi_ep_attr_1_2
+#define fi_domain_attr_1_3 fi_domain_attr
+#define fi_fabric_attr_1_3 fi_fabric_attr_1_2
+fi_info_1_3 -> fi_info
+*/
+
 #define ofi_dup_attr(dst, src)				\
 	do {						\
 		dst = calloc(1, sizeof(*dst));		\
diff --git a/deps/libfabric/src/common.c b/deps/libfabric/src/common.c
index a2b15af2d7a31a88eabe91805fe6ff0349c51a29..4c54dc2dec680136a9114236ac727ef6c1af7492 100644
--- a/deps/libfabric/src/common.c
+++ b/deps/libfabric/src/common.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2006-2017 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2013-2018 Intel Corp., Inc.  All rights reserved.
  * Copyright (c) 2015 Los Alamos Nat. Security, LLC. All rights reserved.
+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -79,6 +80,8 @@ struct ofi_common_locks common_locks = {
 	.util_fabric_lock = PTHREAD_MUTEX_INITIALIZER,
 };
 
+size_t ofi_universe_size = 1024;
+
 int fi_poll_fd(int fd, int timeout)
 {
 	struct pollfd fds;
@@ -218,6 +221,20 @@ int ofi_check_rx_mode(const struct fi_info *info, uint64_t flags)
 	return (info->mode & flags) ? 1 : 0;
 }
 
+uint32_t ofi_generate_seed(void)
+{
+	uint32_t rand_seed;
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+
+	/* Mix the PID into the upper bits and the microsecond timestamp
+	 * into the lower bits, so concurrent processes on the same host
+	 * are unlikely to share a seed. */
+	rand_seed = ((getpid() & 0xffffffff) << 16);
+	rand_seed |= (uint32_t) tv.tv_usec;
+
+	return rand_seed;
+}
+
 uint64_t ofi_gettime_ns(void)
 {
 	struct timespec now;
@@ -267,6 +284,7 @@ const char *ofi_straddr(char *buf, size_t *len,
 	const struct sockaddr *sock_addr;
 	const struct sockaddr_in6 *sin6;
 	const struct sockaddr_in *sin;
+	const struct ofi_sockaddr_ib *sib;
 	char str[INET6_ADDRSTRLEN + 8];
 	size_t size;
 
@@ -315,7 +333,19 @@ sa_sin6:
 				str, *((uint16_t *)addr + 8), *((uint32_t *)addr + 5));
 		break;
 	case FI_SOCKADDR_IB:
-		size = snprintf(buf, *len, "fi_sockaddr_ib://%p", addr);
+		sib = addr;
+		memset(str, 0, sizeof(str));
+		if (!inet_ntop(AF_INET6, sib->sib_addr, str, INET6_ADDRSTRLEN))
+			return NULL;
+
+		size = snprintf(buf, *len, "fi_sockaddr_ib://[%s]" /* GID */
+			     ":0x%" PRIx16 /* P_Key */
+			     ":0x%" PRIx16 /* port space */
+			     ":0x%" PRIx8 /* Scope ID */,
+			     str, /* GID */
+			     ntohs(sib->sib_pkey), /* P_Key */
+			     (uint16_t)(ntohll(sib->sib_sid) >> 16) & 0xfff, /* port space */
+			     (uint8_t)ntohll(sib->sib_scope_id) & 0xff); /* Scope ID */
 		break;
 	case FI_ADDR_PSMX:
 		size = snprintf(buf, *len, "fi_addr_psmx://%" PRIx64,
@@ -363,7 +393,7 @@ sa_sin6:
 	return buf;
 }
 
-static uint32_t ofi_addr_format(const char *str)
+uint32_t ofi_addr_format(const char *str)
 {
 	char fmt[16];
 	int ret;
@@ -459,6 +489,101 @@ static int ofi_str_to_ib_ud(const char *str, void **addr, size_t *len)
 	return -FI_EINVAL;
 }
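+/* Parse the textual AF_IB form produced by ofi_straddr():
+ *   fi_sockaddr_ib://[GID]:<pkey>:<port space>:<scope id>[:<port>]
+ * The numeric fields accept any base understood by strtol(). */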
 
+static int ofi_str_to_sib(const char *str, void **addr, size_t *len)
+{
+	int ret;
+	char *tok, *endptr, *saveptr;
+	struct ofi_sockaddr_ib *sib;
+	uint16_t pkey;
+	uint16_t ps;
+	uint64_t scope_id;
+	uint16_t port;
+	char gid[64 + 1];
+	char extra_str[64 + 1];
+
+	memset(gid, 0, sizeof(gid));
+
+	ret = sscanf(str, "%*[^:]://[%64[^]]]" /* GID */
+		     ":%64s", /* P_Key : port_space : Scope ID : port */
+		     gid, extra_str);
+	if (ret != 2) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid GID in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	tok = strtok_r(extra_str, ":", &saveptr);
+	if (!tok) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid pkey in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	pkey = strtol(tok, &endptr, 0);
+	if (*endptr) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid pkey in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	tok = strtok_r(NULL, ":", &saveptr);
+	if (!tok) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid port space in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	ps = strtol(tok, &endptr, 0);
+	if (*endptr) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid port space in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	tok = strtok_r(NULL, ":", &saveptr);
+	if (!tok) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid scope id in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	scope_id = strtol(tok, &endptr, 0);
+	if (*endptr) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Invalid scope id in address: %s\n", str);
+		return -FI_EINVAL;
+	}
+
+	/* Port is optional */
+	tok = strtok_r(NULL, ":", &saveptr);
+	if (tok)
+		port = strtol(tok, &endptr, 0);
+	else
+		port = 0;
+
+	*len = sizeof(struct ofi_sockaddr_ib);
+	*addr = calloc(1, *len);
+	if (!*addr)
+		return -FI_ENOMEM;
+
+	sib = (struct ofi_sockaddr_ib *)(*addr);
+
+	if (inet_pton(AF_INET6, gid, sib->sib_addr) > 0) {
+		sib->sib_family = AF_IB;
+		sib->sib_pkey = htons(pkey);
+		if (ps && port) {
+			sib->sib_sid = htonll(((uint64_t) ps << 16) + port);
+			sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK |
+			                           OFI_IB_IP_PORT_MASK);
+		}
+		sib->sib_scope_id = htonll(scope_id);
+		return FI_SUCCESS;
+	}
+
+	free(*addr);
+	return -FI_EINVAL;
+}
+
 static int ofi_str_to_efa(const char *str, void **addr, size_t *len)
 {
 	char gid[INET6_ADDRSTRLEN];
@@ -674,6 +799,7 @@ int ofi_str_toaddr(const char *str, uint32_t *addr_format,
 	case FI_ADDR_EFA:
 		return ofi_str_to_efa(str, addr, len);
 	case FI_SOCKADDR_IB:
+		return ofi_str_to_sib(str, addr, len);
 	case FI_ADDR_GNI:
 	case FI_ADDR_BGQ:
 	case FI_ADDR_MLX:
@@ -732,10 +858,10 @@ static int ofi_is_any_addr_port(struct sockaddr *addr)
 {
 	switch (ofi_sa_family(addr)) {
 	case AF_INET:
-		return (ofi_ipv4_is_any_addr(addr) &&
+		return (ofi_sin_is_any_addr(addr) &&
 			ofi_sin_port(addr));
 	case AF_INET6:
-		return (ofi_ipv6_is_any_addr(addr) &&
+		return (ofi_sin6_is_any_addr(addr) &&
 			ofi_sin6_port(addr));
 	default:
 		FI_WARN(&core_prov, FI_LOG_CORE,
@@ -828,9 +954,13 @@ void ofi_straddr_log_internal(const char *func, int line,
 	size_t len = sizeof(buf);
 
 	if (fi_log_enabled(prov, level, subsys)) {
-		addr_format = ofi_translate_addr_format(ofi_sa_family(addr));
-		fi_log(prov, level, subsys, func, line, "%s: %s\n", log_str,
-		       ofi_straddr(buf, &len, addr_format, addr));
+		if (addr) {
+			addr_format = ofi_translate_addr_format(ofi_sa_family(addr));
+			fi_log(prov, level, subsys, func, line, "%s: %s\n", log_str,
+			       ofi_straddr(buf, &len, addr_format, addr));
+		} else {
+			fi_log(prov, level, subsys, func, line, "%s: (null)\n", log_str);
+		}
 	}
 }
 
@@ -1086,10 +1216,11 @@ void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr
 {
 	struct ofi_addr_list_entry *addr_entry;
 
-	addr_entry = calloc(1, sizeof(struct ofi_addr_list_entry));
+	addr_entry = calloc(1, sizeof(*addr_entry));
 	if (!addr_entry)
 		return;
 
+	addr_entry->comm_caps = FI_LOCAL_COMM;
 	addr_entry->ipaddr.sin.sin_family = AF_INET;
 	addr_entry->ipaddr.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 	ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE,
@@ -1100,10 +1231,11 @@ void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr
 	strncpy(addr_entry->ifa_name, "lo", sizeof(addr_entry->ifa_name));
 	slist_insert_tail(&addr_entry->entry, addr_list);
 
-	addr_entry = calloc(1, sizeof(struct ofi_addr_list_entry));
+	addr_entry = calloc(1, sizeof(*addr_entry));
 	if (!addr_entry)
 		return;
 
+	addr_entry->comm_caps = FI_LOCAL_COMM;
 	addr_entry->ipaddr.sin6.sin6_family = AF_INET6;
 	addr_entry->ipaddr.sin6.sin6_addr = in6addr_loopback;
 	ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE,
@@ -1228,10 +1360,11 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name,
 		if (!addr_entry)
 			continue;
 
+		addr_entry->comm_caps = FI_LOCAL_COMM | FI_REMOTE_COMM;
 		memcpy(&addr_entry->ipaddr, ifa->ifa_addr,
 			ofi_sizeofaddr(ifa->ifa_addr));
 		strncpy(addr_entry->ifa_name, ifa->ifa_name,
-			sizeof(addr_entry->ifa_name));
+			sizeof(addr_entry->ifa_name) - 1);
 		ofi_set_netmask_str(addr_entry->net_name,
 				    sizeof(addr_entry->net_name), ifa);
 
@@ -1292,6 +1425,7 @@ void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name,
 			if (!addr_entry)
 				break;
 
+			addr_entry->comm_caps = FI_LOCAL_COMM | FI_REMOTE_COMM;
 			addr_entry->ipaddr.sin.sin_family = AF_INET;
 			addr_entry->ipaddr.sin.sin_addr.s_addr =
 						iptbl->table[i].dwAddr;
@@ -1602,3 +1736,54 @@ fail:
 	ofi_nic_close(&dup_nic->fid);
 	return NULL;
 }
+
+/*
+ * Calculate bits per second based on verbs port active_speed and active_width.
+ */
+size_t ofi_vrb_speed(uint8_t speed, uint8_t width)
+{
+	const size_t gbit_2_bit_coef = 1000 * 1000 * 1000;
+	size_t width_val, speed_val;
+
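+	/* Map the verbs active_speed encoding to a per-lane data rate:
+	 * 1 = SDR (2.5 Gb/s), 2 = DDR (5 Gb/s), 4 = QDR and 8 = FDR10
+	 * (both treated as 8 Gb/s of data after encoding overhead),
+	 * 16 = FDR (14 Gb/s), 32 = EDR (25 Gb/s).
+	 */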
+	switch (speed) {
+	case 1:
+		speed_val = (size_t) (2.5 * (float) gbit_2_bit_coef);
+		break;
+	case 2:
+		speed_val = 5 * gbit_2_bit_coef;
+		break;
+	case 4:
+	case 8:
+		speed_val = 8 * gbit_2_bit_coef;
+		break;
+	case 16:
+		speed_val = 14 * gbit_2_bit_coef;
+		break;
+	case 32:
+		speed_val = 25 * gbit_2_bit_coef;
+		break;
+	default:
+		speed_val = 0;
+		break;
+	}
+
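+	/* Map the verbs active_width encoding to a lane count:
+	 * 1 = 1x, 2 = 4x, 4 = 8x, 8 = 12x. For example, EDR (speed 32)
+	 * over a 4x link (width 2) yields 4 * 25 Gb/s = 100 Gb/s.
+	 */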
+	switch (width) {
+	case 1:
+		width_val = 1;
+		break;
+	case 2:
+		width_val = 4;
+		break;
+	case 4:
+		width_val = 8;
+		break;
+	case 8:
+		width_val = 12;
+		break;
+	default:
+		width_val = 0;
+		break;
+	}
+
+	return width_val * speed_val;
+}
diff --git a/deps/libfabric/src/fabric.c b/deps/libfabric/src/fabric.c
index 01b8d71e54b98d2a3345e2bcc7af73d0f65b2be8..73de7336f269f6c1f8fb2c1c7928c0f71f889bc7 100644
--- a/deps/libfabric/src/fabric.c
+++ b/deps/libfabric/src/fabric.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2006-2016 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2013-2017 Intel Corp., Inc.  All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -39,6 +40,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <dirent.h>
+#include <ctype.h>
 
 #include <rdma/fi_errno.h>
 #include "ofi_util.h"
@@ -46,6 +48,7 @@
 #include "shared/ofi_str.h"
 #include "ofi_prov.h"
 #include "ofi_perf.h"
+#include "ofi_hmem.h"
 
 #ifdef HAVE_LIBDL
 #include <dlfcn.h>
@@ -135,6 +138,13 @@ static enum ofi_prov_type ofi_prov_type(const struct fi_provider *provider)
 	return ctx->type;
 }
 
+static int ofi_disable_util_layering(const struct fi_provider *provider)
+{
+	const struct fi_prov_context *ctx;
+
+	ctx = (const struct fi_prov_context *) &provider->context;
+	return ctx->disable_layering;
+}
+
 static int ofi_is_util_prov(const struct fi_provider *provider)
 {
 	return ofi_prov_type(provider) == OFI_PROV_UTIL;
@@ -261,7 +271,7 @@ static struct ofi_prov *ofi_getprov(const char *prov_name, size_t len)
 
 	for (prov = prov_head; prov; prov = prov->next) {
 		if ((strlen(prov->prov_name) == len) &&
-		    !strncmp(prov->prov_name, prov_name, len))
+		    !strncasecmp(prov->prov_name, prov_name, len))
 			return prov;
 	}
 
@@ -360,7 +370,7 @@ static void ofi_ordered_provs_init(void)
 		 */
 
 		/* Before you add ANYTHING here, read the comment above!!! */
-		"UDP", "tcp", "sockets", /* NOTHING GOES HERE! */
+		"udp", "tcp", "sockets", /* NOTHING GOES HERE! */
 		/* Seriously, read it! */
 
 		/* These are hooking providers only.  Their order
@@ -434,6 +444,15 @@ static void ofi_register_provider(struct fi_provider *provider, void *dlhandle)
 	if (ofi_apply_filter(&prov_log_filter, provider->name))
 		ctx->disable_logging = 1;
 
+	/*
+	 * Prevent utility providers from layering on these core providers
+	 * unless explicitly requested.
+	 */
+	if (!strcasecmp(provider->name, "sockets") ||
+	    !strcasecmp(provider->name, "shm") ||
+	    !strcasecmp(provider->name, "efa") || ofi_is_util_prov(provider))
+		ctx->disable_layering = 1;
+
 	prov = ofi_getprov(provider->name, strlen(provider->name));
 	if (prov) {
 		/* If this provider has not been init yet, then we add the
@@ -535,23 +554,46 @@ void ofi_create_filter(struct fi_filter *filter, const char *raw_filter)
 	}
 
 	filter->names = ofi_split_and_alloc(raw_filter, ",", NULL);
-	if (!filter->names)
+	if (!filter->names) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
 			"unable to parse filter from: %s\n", raw_filter);
+		return;
+	}
 
-	if(verify_filter_names(filter->names))
+	if (verify_filter_names(filter->names))
 		FI_WARN(&core_prov, FI_LOG_CORE,
 		        "unable to verify filter name\n");
 }
 
 #ifdef HAVE_LIBDL
+static void ofi_reg_dl_prov(const char *lib)
+{
+	void *dlhandle;
+	struct fi_provider* (*inif)(void);
+
+	FI_DBG(&core_prov, FI_LOG_CORE, "opening provider lib %s\n", lib);
+
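+	/* Probing may hit libraries that are absent or unloadable, so a
+	 * dlopen failure is only logged at debug level.
+	 */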
+	dlhandle = dlopen(lib, RTLD_NOW);
+	if (dlhandle == NULL) {
+		FI_DBG(&core_prov, FI_LOG_CORE,
+			"dlopen(%s): %s\n", lib, dlerror());
+		return;
+	}
+
+	inif = dlsym(dlhandle, "fi_prov_ini");
+	if (inif == NULL) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "dlsym: %s\n", dlerror());
+		dlclose(dlhandle);
+	} else {
+		ofi_register_provider((inif)(), dlhandle);
+	}
+}
+
 static void ofi_ini_dir(const char *dir)
 {
 	int n = 0;
 	char *lib;
-	void *dlhandle;
 	struct dirent **liblist = NULL;
-	struct fi_provider* (*inif)(void);
 
 	n = scandir(dir, &liblist, lib_filter, NULL);
 	if (n < 0)
@@ -563,25 +605,10 @@ static void ofi_ini_dir(const char *dir)
 			       "asprintf failed to allocate memory\n");
 			goto libdl_done;
 		}
-		FI_DBG(&core_prov, FI_LOG_CORE, "opening provider lib %s\n", lib);
+		ofi_reg_dl_prov(lib);
 
-		dlhandle = dlopen(lib, RTLD_NOW);
 		free(liblist[n]);
-		if (dlhandle == NULL) {
-			FI_WARN(&core_prov, FI_LOG_CORE,
-			       "dlopen(%s): %s\n", lib, dlerror());
-			free(lib);
-			continue;
-		}
 		free(lib);
-
-		inif = dlsym(dlhandle, "fi_prov_ini");
-		if (inif == NULL) {
-			FI_WARN(&core_prov, FI_LOG_CORE, "dlsym: %s\n", dlerror());
-			dlclose(dlhandle);
-		} else {
-			ofi_register_provider((inif)(), dlhandle);
-		}
 	}
 
 libdl_done:
@@ -589,6 +616,39 @@ libdl_done:
 		free(liblist[n]);
 	free(liblist);
 }
+
+/* Search the standard dynamic linker paths (e.g. LD_LIBRARY_PATH) for
+ * known DL provider libraries.
+ */
+static void ofi_find_prov_libs(void)
+{
+	const char* lib_prefix = "lib";
+	struct ofi_prov *prov;
+	char* lib;
+	char* short_prov_name;
+
+	for (prov = prov_head; prov; prov = prov->next) {
+
+		if (!prov->prov_name)
+			continue;
+
+		if (ofi_has_util_prefix(prov->prov_name)) {
+			short_prov_name = prov->prov_name + strlen(OFI_UTIL_PREFIX);
+		} else {
+			short_prov_name = prov->prov_name;
+		}
+
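+		/* Compose the DL provider file name: "lib" + name + "-" +
+		 * FI_LIB_SUFFIX, e.g. "libefa-fi.so" when the suffix is
+		 * "fi.so".
+		 */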
+		if (asprintf(&lib, "%s%s%s%s", lib_prefix,
+			short_prov_name, "-", FI_LIB_SUFFIX) < 0) {
+			FI_WARN(&core_prov, FI_LOG_CORE,
+				"asprintf failed to allocate memory\n");
+			continue;
+		}
+
+		ofi_reg_dl_prov(lib);
+		free(lib);
+	}
+}
 #endif
 
 void fi_ini(void)
@@ -608,7 +668,8 @@ void fi_ini(void)
 	ofi_pmem_init();
 	ofi_perf_init();
 	ofi_hook_init();
-	ofi_monitor_init();
+	ofi_hmem_init();
+	ofi_monitors_init();
 
 	fi_param_define(NULL, "provider", FI_PARAM_STRING,
 			"Only use specified provider (default: all available)");
@@ -621,7 +682,8 @@ void fi_ini(void)
 			"Defines the maximum number of processes that will be"
 			" used by distribute OFI application. The provider uses"
 			" this to optimize resource allocations"
-			" (default: OFI service specific)");
+			" (default: provider specific)");
+	fi_param_get_size_t(NULL, "universe_size", &ofi_universe_size);
 	fi_param_get_str(NULL, "provider", &param_val);
 	ofi_create_filter(&prov_filter, param_val);
 
@@ -643,9 +705,10 @@ void fi_ini(void)
 			"Search for providers in specific path (default: "
 			PROVDLDIR ")");
 	fi_param_get_str(NULL, "provider_path", &provdir);
-	if (!provdir)
+	if (!provdir) {
 		provdir = PROVDLDIR;
-
+		ofi_find_prov_libs();
+	}
 	dirs = ofi_split_and_alloc(provdir, ":", NULL);
 	if (dirs) {
 		for (n = 0; dirs[n]; ++n) {
@@ -699,7 +762,8 @@ FI_DESTRUCTOR(fi_fini(void))
 	}
 
 	ofi_free_filter(&prov_filter);
-	ofi_monitor_cleanup();
+	ofi_monitors_cleanup();
+	ofi_hmem_cleanup();
 	ofi_mem_fini();
 	fi_log_fini();
 	fi_param_fini();
@@ -812,8 +876,9 @@ static void ofi_set_prov_attr(struct fi_fabric_attr *attr,
  *    1b. If a utility provider is specified, return it over any* core provider.
  *    1c. If a core provider is specified, return any utility provider that can
  *        layer over it, plus the core provider itself, if possible.
- *    1d. A utility provider will not layer over the sockets provider unless the
- *        user explicitly requests that combination.
+ *    1d. A utility provider will not layer over a provider that has disabled
+ *        utility provider layering unless the user explicitly requests that
+ *        combination.
  *    1e. OFI_CORE_PROV_ONLY flag prevents utility providers layering over other
  *        utility providers.
  * 2. If both the providers are utility providers or if more than two providers
@@ -827,6 +892,7 @@ static int ofi_layering_ok(const struct fi_provider *provider,
 			   uint64_t flags)
 {
 	char *prov_name;
+	struct ofi_prov *core_ofi_prov;
 	int i;
 
 	/* Excluded providers must be at the end */
@@ -848,9 +914,9 @@ static int ofi_layering_ok(const struct fi_provider *provider,
 			return 0;
 		}
 
-		if ((count == 0) && !strcasecmp(provider->name, "sockets")) {
+		if ((count == 0) && ofi_disable_util_layering(provider)) {
 			FI_INFO(&core_prov, FI_LOG_CORE,
-				"Skipping util;sockets layering\n");
+				"Skipping util;%s layering\n", provider->name);
 			return 0;
 		}
 	}
@@ -865,14 +931,12 @@ static int ofi_layering_ok(const struct fi_provider *provider,
 
 	if ((count == 1) && ofi_is_util_prov(provider) &&
 	    !ofi_has_util_prefix(prov_vec[0])) {
-		if (!strcasecmp(prov_vec[0], "sockets")) {
+		core_ofi_prov = ofi_getprov(prov_vec[0], strlen(prov_vec[0]));
+		if (core_ofi_prov && core_ofi_prov->provider &&
+		    ofi_disable_util_layering(core_ofi_prov->provider)) {
 			FI_INFO(&core_prov, FI_LOG_CORE,
-				"Sockets requested, skipping util layering\n");
-			return 0;
-		}
-		if (!strcasecmp(prov_vec[0], "shm")) {
-			FI_INFO(&core_prov, FI_LOG_CORE,
-				"Shm requested, skipping util layering\n");
+				"Skipping %s;%s layering\n", prov_vec[0],
+				provider->name);
 			return 0;
 		}
 		return 1;
diff --git a/deps/libfabric/src/fi_tostr.c b/deps/libfabric/src/fi_tostr.c
index 344ee6a6b464b0a22eef3b4ba5c4df9fb7a75af3..4a0370e965cb824c8509154cca5bf2cf418431e4 100644
--- a/deps/libfabric/src/fi_tostr.c
+++ b/deps/libfabric/src/fi_tostr.c
@@ -96,6 +96,7 @@ static void ofi_tostr_opflags(char *buf, uint64_t flags)
 	IFFLAGSTR(flags, FI_INJECT_COMPLETE);
 	IFFLAGSTR(flags, FI_TRANSMIT_COMPLETE);
 	IFFLAGSTR(flags, FI_DELIVERY_COMPLETE);
+	IFFLAGSTR(flags, FI_MATCH_COMPLETE);
 	IFFLAGSTR(flags, FI_AFFINITY);
 
 	IFFLAGSTR(flags, FI_CLAIM);
@@ -682,6 +683,19 @@ static void ofi_tostr_cq_event_flags(char *buf, uint64_t flags)
 	ofi_remove_comma(buf);
 }
 
+static void ofi_tostr_hmem_iface(char *buf, enum fi_hmem_iface iface)
+{
+	switch (iface) {
+	CASEENUMSTR(FI_HMEM_SYSTEM);
+	CASEENUMSTR(FI_HMEM_CUDA);
+	CASEENUMSTR(FI_HMEM_ROCR);
+	CASEENUMSTR(FI_HMEM_ZE);
+	default:
+		ofi_strcatf(buf, "Unknown");
+		break;
+	}
+}
+
 __attribute__((visibility ("default"),EXTERNALLY_VISIBLE))
 char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype)
 {
@@ -781,6 +795,9 @@ char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype)
 	case FI_TYPE_COLLECTIVE_OP:
 		ofi_tostr_collective_op(buf, *enumval);
 		break;
+	case FI_TYPE_HMEM_IFACE:
+		ofi_tostr_hmem_iface(buf, *enumval);
+		break;
 	default:
 		ofi_strcatf(buf, "Unknown type");
 		break;
diff --git a/deps/libfabric/src/hmem.c b/deps/libfabric/src/hmem.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e70a7c3cdc94e4bbcc6eebe58c595b32f0e99dc
--- /dev/null
+++ b/deps/libfabric/src/hmem.c
@@ -0,0 +1,240 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "ofi_hmem.h"
+#include "ofi.h"
+#include "ofi_iov.h"
+
+struct ofi_hmem_ops {
+	bool initialized;
+	int (*init)(void);
+	int (*cleanup)(void);
+	int (*copy_to_hmem)(uint64_t device, void *dest, const void *src,
+			    size_t size);
+	int (*copy_from_hmem)(uint64_t device, void *dest, const void *src,
+			      size_t size);
+	bool (*is_addr_valid)(const void *addr);
+	int (*get_handle)(void *dev_buf, void **handle);
+	int (*open_handle)(void **handle, uint64_t device, void **ipc_ptr);
+	int (*close_handle)(void *ipc_ptr);
+};
+
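+/* Dispatch table indexed by enum fi_hmem_iface. Ifaces whose init fails
+ * (e.g. because the runtime library is absent) are left marked
+ * uninitialized.
+ */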
+static struct ofi_hmem_ops hmem_ops[] = {
+	[FI_HMEM_SYSTEM] = {
+		.initialized = false,
+		.init = ofi_hmem_init_noop,
+		.cleanup = ofi_hmem_cleanup_noop,
+		.copy_to_hmem = ofi_memcpy,
+		.copy_from_hmem = ofi_memcpy,
+		.get_handle = ofi_hmem_no_get_handle,
+		.open_handle = ofi_hmem_no_open_handle,
+		.close_handle = ofi_hmem_no_close_handle,
+	},
+	[FI_HMEM_CUDA] = {
+		.initialized = false,
+		.init = cuda_hmem_init,
+		.cleanup = cuda_hmem_cleanup,
+		.copy_to_hmem = cuda_copy_to_dev,
+		.copy_from_hmem = cuda_copy_from_dev,
+		.is_addr_valid = cuda_is_addr_valid,
+		.get_handle = ofi_hmem_no_get_handle,
+		.open_handle = ofi_hmem_no_open_handle,
+		.close_handle = ofi_hmem_no_close_handle,
+	},
+	[FI_HMEM_ROCR] = {
+		.initialized = false,
+		.init = rocr_hmem_init,
+		.cleanup = rocr_hmem_cleanup,
+		.copy_to_hmem = rocr_memcpy,
+		.copy_from_hmem = rocr_memcpy,
+		.is_addr_valid = rocr_is_addr_valid,
+		.get_handle = ofi_hmem_no_get_handle,
+		.open_handle = ofi_hmem_no_open_handle,
+		.close_handle = ofi_hmem_no_close_handle,
+	},
+	[FI_HMEM_ZE] = {
+		.initialized = false,
+		.init = ze_hmem_init,
+		.cleanup = ze_hmem_cleanup,
+		.copy_to_hmem = ze_hmem_copy,
+		.copy_from_hmem = ze_hmem_copy,
+		.is_addr_valid = ze_is_addr_valid,
+		.get_handle = ze_hmem_get_handle,
+		.open_handle = ze_hmem_open_handle,
+		.close_handle = ze_hmem_close_handle,
+	},
+};
+
+static inline int ofi_copy_to_hmem(enum fi_hmem_iface iface, uint64_t device,
+				   void *dest, const void *src, size_t size)
+{
+	return hmem_ops[iface].copy_to_hmem(device, dest, src, size);
+}
+
+static inline int ofi_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device,
+				     void *dest, const void *src, size_t size)
+{
+	return hmem_ops[iface].copy_from_hmem(device, dest, src, size);
+}
+
+static ssize_t ofi_copy_hmem_iov_buf(enum fi_hmem_iface hmem_iface, uint64_t device,
+				     const struct iovec *hmem_iov,
+				     size_t hmem_iov_count,
+				     uint64_t hmem_iov_offset, void *buf,
+				     size_t size, int dir)
+{
+	uint64_t done = 0, len;
+	char *hmem_buf;
+	size_t i;
+	int ret;
+
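+	/* Skip whole iov entries until hmem_iov_offset is consumed, then
+	 * copy up to 'size' bytes, clearing the offset after the first
+	 * partially consumed entry.
+	 */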
+	for (i = 0; i < hmem_iov_count && size; i++) {
+		len = hmem_iov[i].iov_len;
+
+		if (hmem_iov_offset > len) {
+			hmem_iov_offset -= len;
+			continue;
+		}
+
+		hmem_buf = (char *)hmem_iov[i].iov_base + hmem_iov_offset;
+		len -= hmem_iov_offset;
+
+		len = MIN(len, size);
+		if (dir == OFI_COPY_BUF_TO_IOV)
+			ret = ofi_copy_to_hmem(hmem_iface, device, hmem_buf,
+					       (char *)buf + done, len);
+		else
+			ret = ofi_copy_from_hmem(hmem_iface, device,
+						 (char *)buf + done, hmem_buf,
+						 len);
+
+		if (ret)
+			return ret;
+
+		hmem_iov_offset = 0;
+		size -= len;
+		done += len;
+	}
+	return done;
+}
+
+ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size,
+			       enum fi_hmem_iface hmem_iface, uint64_t device,
+			       const struct iovec *hmem_iov,
+			       size_t hmem_iov_count,
+			       uint64_t hmem_iov_offset)
+{
+	return ofi_copy_hmem_iov_buf(hmem_iface, device, hmem_iov,
+				     hmem_iov_count, hmem_iov_offset,
+				     dest, size, OFI_COPY_IOV_TO_BUF);
+}
+
+ssize_t ofi_copy_to_hmem_iov(enum fi_hmem_iface hmem_iface, uint64_t device,
+			     const struct iovec *hmem_iov,
+			     size_t hmem_iov_count, uint64_t hmem_iov_offset,
+			     void *src, size_t size)
+{
+	return ofi_copy_hmem_iov_buf(hmem_iface, device, hmem_iov,
+				     hmem_iov_count, hmem_iov_offset,
+				     src, size, OFI_COPY_BUF_TO_IOV);
+}
+
+int ofi_hmem_get_handle(enum fi_hmem_iface iface, void *dev_buf, void **handle)
+{
+	return hmem_ops[iface].get_handle(dev_buf, handle);
+}
+
+int ofi_hmem_open_handle(enum fi_hmem_iface iface, void **handle,
+			 uint64_t device, void **ipc_ptr)
+{
+	return hmem_ops[iface].open_handle(handle, device, ipc_ptr);
+}
+
+int ofi_hmem_close_handle(enum fi_hmem_iface iface, void *ipc_ptr)
+{
+	return hmem_ops[iface].close_handle(ipc_ptr);
+}
+
+void ofi_hmem_init(void)
+{
+	enum fi_hmem_iface iface;
+	int ret;
+
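+	/* Probe every iface; -FI_ENOSYS just means the runtime library is
+	 * unavailable and is logged at INFO rather than WARN.
+	 */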
+	for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) {
+		ret = hmem_ops[iface].init();
+		if (ret != FI_SUCCESS) {
+			if (ret == -FI_ENOSYS)
+				FI_INFO(&core_prov, FI_LOG_CORE,
+					"Hmem iface %s not supported\n",
+					fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
+			else
+				FI_WARN(&core_prov, FI_LOG_CORE,
+					"Failed to initialize hmem iface %s: %s\n",
+					fi_tostr(&iface, FI_TYPE_HMEM_IFACE),
+					fi_strerror(-ret));
+		} else {
+			hmem_ops[iface].initialized = true;
+		}
+	}
+}
+
+void ofi_hmem_cleanup(void)
+{
+	enum fi_hmem_iface iface;
+
+	for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) {
+		if (hmem_ops[iface].initialized)
+			hmem_ops[iface].cleanup();
+	}
+}
+
+enum fi_hmem_iface ofi_get_hmem_iface(const void *addr)
+{
+	enum fi_hmem_iface iface;
+
+	/* FI_HMEM_SYSTEM does not implement an is_addr_valid function, so
+	 * that iface is skipped. If no other HMEM iface claims the address
+	 * as valid, the address is assumed to be FI_HMEM_SYSTEM.
+	 */
+	for (iface = ARRAY_SIZE(hmem_ops) - 1; iface > FI_HMEM_SYSTEM;
+	     iface--) {
+		if (hmem_ops[iface].initialized &&
+		    hmem_ops[iface].is_addr_valid(addr))
+			return iface;
+	}
+
+	return FI_HMEM_SYSTEM;
+}
diff --git a/deps/libfabric/src/hmem_cuda.c b/deps/libfabric/src/hmem_cuda.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7d1e73a1c528ad2e6ea2192a884900efb525894
--- /dev/null
+++ b/deps/libfabric/src/hmem_cuda.c
@@ -0,0 +1,278 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "ofi_hmem.h"
+#include "ofi.h"
+
+#if HAVE_LIBCUDA
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_ops {
+	cudaError_t (*cudaMemcpy)(void *dst, const void *src, size_t count,
+				  enum cudaMemcpyKind kind);
+	const char *(*cudaGetErrorName)(cudaError_t error);
+	const char *(*cudaGetErrorString)(cudaError_t error);
+	CUresult (*cuPointerGetAttribute)(void *data,
+					  CUpointer_attribute attribute,
+					  CUdeviceptr ptr);
+};
+
+#ifdef ENABLE_CUDA_DLOPEN
+
+#include <dlfcn.h>
+
+static void *cudart_handle;
+static void *cuda_handle;
+static struct cuda_ops cuda_ops;
+
+#else
+
+static struct cuda_ops cuda_ops = {
+	.cudaMemcpy = cudaMemcpy,
+	.cudaGetErrorName = cudaGetErrorName,
+	.cudaGetErrorString = cudaGetErrorString,
+	.cuPointerGetAttribute = cuPointerGetAttribute,
+};
+
+#endif /* ENABLE_CUDA_DLOPEN */
+
+cudaError_t ofi_cudaMemcpy(void *dst, const void *src, size_t count,
+			   enum cudaMemcpyKind kind)
+{
+	return cuda_ops.cudaMemcpy(dst, src, count, kind);
+}
+
+const char *ofi_cudaGetErrorName(cudaError_t error)
+{
+	return cuda_ops.cudaGetErrorName(error);
+}
+
+const char *ofi_cudaGetErrorString(cudaError_t error)
+{
+	return cuda_ops.cudaGetErrorString(error);
+}
+
+CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute,
+				   CUdeviceptr ptr)
+{
+	return cuda_ops.cuPointerGetAttribute(data, attribute, ptr);
+}
+
+int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = ofi_cudaMemcpy(dev, host, size, cudaMemcpyHostToDevice);
+	if (cuda_ret == cudaSuccess)
+		return 0;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform cudaMemcpy: %s:%s\n",
+		ofi_cudaGetErrorName(cuda_ret),
+		ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EIO;
+}
+
+int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size)
+{
+	cudaError_t cuda_ret;
+
+	cuda_ret = ofi_cudaMemcpy(host, dev, size, cudaMemcpyDeviceToHost);
+	if (cuda_ret == cudaSuccess)
+		return 0;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform cudaMemcpy: %s:%s\n",
+		ofi_cudaGetErrorName(cuda_ret),
+		ofi_cudaGetErrorString(cuda_ret));
+
+	return -FI_EIO;
+}
+
+static int cuda_hmem_dl_init(void)
+{
+#ifdef ENABLE_CUDA_DLOPEN
+	/* Assume a failure to dlopen the CUDA runtime means the library is
+	 * not present; in that case CUDA is simply not supported.
+	 */
+	cudart_handle = dlopen("libcudart.so", RTLD_NOW);
+	if (!cudart_handle) {
+		FI_INFO(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libcudart.so\n");
+		return -FI_ENOSYS;
+	}
+
+	cuda_handle = dlopen("libcuda.so", RTLD_NOW);
+	if (!cuda_handle) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libcuda.so\n");
+		goto err_dlclose_cudart;
+	}
+
+	cuda_ops.cudaMemcpy = dlsym(cudart_handle, "cudaMemcpy");
+	if (!cuda_ops.cudaMemcpy) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find cudaMemcpy\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaGetErrorName = dlsym(cudart_handle, "cudaGetErrorName");
+	if (!cuda_ops.cudaGetErrorName) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaGetErrorName\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cudaGetErrorString = dlsym(cudart_handle,
+					    "cudaGetErrorString");
+	if (!cuda_ops.cudaGetErrorString) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cudaGetErrorString\n");
+		goto err_dlclose_cuda;
+	}
+
+	cuda_ops.cuPointerGetAttribute = dlsym(cuda_handle,
+					       "cuPointerGetAttribute");
+	if (!cuda_ops.cuPointerGetAttribute) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find cuPointerGetAttribute\n");
+		goto err_dlclose_cuda;
+	}
+
+	return FI_SUCCESS;
+
+err_dlclose_cuda:
+	dlclose(cuda_handle);
+err_dlclose_cudart:
+	dlclose(cudart_handle);
+
+	return -FI_ENODATA;
+#else
+	return FI_SUCCESS;
+#endif /* ENABLE_CUDA_DLOPEN */
+}
+
+int cuda_hmem_init(void)
+{
+	return cuda_hmem_dl_init();
+}
+
+int cuda_hmem_cleanup(void)
+{
+#ifdef ENABLE_CUDA_DLOPEN
+	dlclose(cuda_handle);
+	dlclose(cudart_handle);
+#endif
+
+	return FI_SUCCESS;
+}
+
+bool cuda_is_addr_valid(const void *addr)
+{
+	CUresult cuda_ret;
+	unsigned int data;
+
+	cuda_ret = ofi_cuPointerGetAttribute(&data,
+					     CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+					     (CUdeviceptr)addr);
+	switch (cuda_ret) {
+	case CUDA_SUCCESS:
+		if (data == CU_MEMORYTYPE_DEVICE)
+			return true;
+		break;
+
+	/* Returned if the buffer is not associated with a CUcontext that
+	 * supports unified virtual addressing. Since host buffers may fall
+	 * into this category, this is not treated as an error.
+	 */
+	case CUDA_ERROR_INVALID_VALUE:
+		break;
+
+	/* Returned if cuInit() has not been called. This can happen if support
+	 * for CUDA is enabled but the user has not made a CUDA call. This is
+	 * not treated as an error.
+	 */
+	case CUDA_ERROR_NOT_INITIALIZED:
+		break;
+
+	/* Returned if the CUcontext does not support unified virtual
+	 * addressing.
+	 */
+	case CUDA_ERROR_INVALID_CONTEXT:
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"CUcontext does not support unified virtual addressing\n");
+		break;
+
+	default:
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Unhandled cuPointerGetAttribute return code: ret=%d\n",
+			cuda_ret);
+		break;
+	}
+
+	return false;
+}
+
+#else
+
+int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_hmem_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int cuda_hmem_cleanup(void)
+{
+	return -FI_ENOSYS;
+}
+
+bool cuda_is_addr_valid(const void *addr)
+{
+	return false;
+}
+
+#endif /* HAVE_LIBCUDA */
diff --git a/deps/libfabric/src/hmem_rocr.c b/deps/libfabric/src/hmem_rocr.c
new file mode 100644
index 0000000000000000000000000000000000000000..99eca4210ecc78c7f347b3ed4f8ca2691c6f0dc7
--- /dev/null
+++ b/deps/libfabric/src/hmem_rocr.c
@@ -0,0 +1,333 @@
+/*
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "ofi_hmem.h"
+#include "ofi.h"
+
+#ifdef HAVE_ROCR
+
+#include <hsa/hsa_ext_amd.h>
+
+struct rocr_ops {
+	hsa_status_t (*hsa_memory_copy)(void *dst, const void *src,
+					size_t size);
+	hsa_status_t (*hsa_amd_pointer_info)(void *ptr,
+					     hsa_amd_pointer_info_t *info,
+					     void *(*alloc)(size_t),
+					     uint32_t *num_agents_accessible,
+					     hsa_agent_t **accessible);
+	hsa_status_t (*hsa_init)(void);
+	hsa_status_t (*hsa_shut_down)(void);
+	hsa_status_t (*hsa_status_string)(hsa_status_t status,
+					  const char **status_string);
+	hsa_status_t (*hsa_amd_dereg_dealloc_cb)(void *ptr,
+						 hsa_amd_deallocation_callback_t cb);
+	hsa_status_t (*hsa_amd_reg_dealloc_cb)(void *ptr,
+					       hsa_amd_deallocation_callback_t cb,
+					       void *user_data);
+};
+
+#ifdef ENABLE_ROCR_DLOPEN
+
+#include <dlfcn.h>
+
+static void *rocr_handle;
+static struct rocr_ops rocr_ops;
+
+#else
+
+static struct rocr_ops rocr_ops = {
+	.hsa_memory_copy = hsa_memory_copy,
+	.hsa_amd_pointer_info = hsa_amd_pointer_info,
+	.hsa_init = hsa_init,
+	.hsa_shut_down = hsa_shut_down,
+	.hsa_status_string = hsa_status_string,
+	.hsa_amd_dereg_dealloc_cb =
+		hsa_amd_deregister_deallocation_callback,
+	.hsa_amd_reg_dealloc_cb =
+		hsa_amd_register_deallocation_callback,
+};
+
+#endif /* ENABLE_ROCR_DLOPEN */
+
+hsa_status_t ofi_hsa_memory_copy(void *dst, const void *src, size_t size)
+{
+	return rocr_ops.hsa_memory_copy(dst, src, size);
+}
+
+hsa_status_t ofi_hsa_amd_pointer_info(void *ptr, hsa_amd_pointer_info_t *info,
+				      void *(*alloc)(size_t),
+				      uint32_t *num_agents_accessible,
+				      hsa_agent_t **accessible)
+{
+	return rocr_ops.hsa_amd_pointer_info(ptr, info, alloc,
+					     num_agents_accessible, accessible);
+}
+
+hsa_status_t ofi_hsa_init(void)
+{
+	return rocr_ops.hsa_init();
+}
+
+hsa_status_t ofi_hsa_shut_down(void)
+{
+	return rocr_ops.hsa_shut_down();
+}
+
+hsa_status_t ofi_hsa_status_string(hsa_status_t status,
+				   const char **status_string)
+{
+	return rocr_ops.hsa_status_string(status, status_string);
+}
+
+const char *ofi_hsa_status_to_string(hsa_status_t status)
+{
+	const char *str;
+	hsa_status_t hsa_ret;
+
+	hsa_ret = ofi_hsa_status_string(status, &str);
+	if (hsa_ret != HSA_STATUS_SUCCESS)
+		return "unknown error";
+
+	return str;
+}
+
+hsa_status_t ofi_hsa_amd_dereg_dealloc_cb(void *ptr,
+					  hsa_amd_deallocation_callback_t cb)
+{
+	return rocr_ops.hsa_amd_dereg_dealloc_cb(ptr, cb);
+}
+
+hsa_status_t ofi_hsa_amd_reg_dealloc_cb(void *ptr,
+					hsa_amd_deallocation_callback_t cb,
+					void *user_data)
+{
+	return rocr_ops.hsa_amd_reg_dealloc_cb(ptr, cb, user_data);
+}
+
+int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = ofi_hsa_memory_copy(dest, src, size);
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return 0;
+
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform hsa_memory_copy: %s\n",
+		ofi_hsa_status_to_string(hsa_ret));
+
+	return -FI_EIO;
+}
+
+bool rocr_is_addr_valid(const void *addr)
+{
+	hsa_amd_pointer_info_t hsa_info = {
+		.size = sizeof(hsa_info),
+	};
+	hsa_status_t hsa_ret;
+
+	hsa_ret = ofi_hsa_amd_pointer_info((void *)addr, &hsa_info, NULL, NULL,
+					   NULL);
+	if (hsa_ret == HSA_STATUS_SUCCESS) {
+		if (hsa_info.type == HSA_EXT_POINTER_TYPE_HSA)
+			return true;
+	} else {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform hsa_amd_pointer_info: %s\n",
+			ofi_hsa_status_to_string(hsa_ret));
+	}
+
+	return false;
+}
+
+static int rocr_hmem_dl_init(void)
+{
+#ifdef ENABLE_ROCR_DLOPEN
+	/* Assume that if dlopen fails, the ROCR library could not be found;
+	 * do not treat this as an error.
+	 */
+	rocr_handle = dlopen("libhsa-runtime64.so", RTLD_NOW);
+	if (!rocr_handle) {
+		FI_INFO(&core_prov, FI_LOG_CORE,
+			"Unable to dlopen libhsa-runtime64.so\n");
+		return -FI_ENOSYS;
+	}
+
+	rocr_ops.hsa_memory_copy = dlsym(rocr_handle, "hsa_memory_copy");
+	if (!rocr_ops.hsa_memory_copy) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_memory_copy\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_amd_pointer_info = dlsym(rocr_handle,
+					      "hsa_amd_pointer_info");
+	if (!rocr_ops.hsa_amd_pointer_info) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_amd_pointer_info\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_init = dlsym(rocr_handle, "hsa_init");
+	if (!rocr_ops.hsa_init) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find hsa_init\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_shut_down = dlsym(rocr_handle, "hsa_shut_down");
+	if (!rocr_ops.hsa_shut_down) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_shut_down\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_status_string = dlsym(rocr_handle, "hsa_status_string");
+	if (!rocr_ops.hsa_status_string) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_status_string\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_amd_dereg_dealloc_cb =
+		dlsym(rocr_handle, "hsa_amd_deregister_deallocation_callback");
+	if (!rocr_ops.hsa_amd_dereg_dealloc_cb) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_amd_deregister_deallocation_callback\n");
+		goto err;
+	}
+
+	rocr_ops.hsa_amd_reg_dealloc_cb =
+		dlsym(rocr_handle, "hsa_amd_register_deallocation_callback");
+	if (!rocr_ops.hsa_amd_reg_dealloc_cb) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to find hsa_amd_register_deallocation_callback\n");
+		goto err;
+	}
+
+	return FI_SUCCESS;
+
+err:
+	dlclose(rocr_handle);
+
+	return -FI_ENODATA;
+#else
+	return FI_SUCCESS;
+#endif /* ENABLE_ROCR_DLOPEN */
+}
+
+static void rocr_hmem_dl_cleanup(void)
+{
+#ifdef ENABLE_ROCR_DLOPEN
+	dlclose(rocr_handle);
+#endif
+}
+
+int rocr_hmem_init(void)
+{
+	hsa_status_t hsa_ret;
+	int ret;
+	int log_level;
+
+	ret = rocr_hmem_dl_init();
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	hsa_ret = ofi_hsa_init();
+	if (hsa_ret == HSA_STATUS_SUCCESS)
+		return FI_SUCCESS;
+
+	/* Treat HSA_STATUS_ERROR_OUT_OF_RESOURCES as ROCR not being supported
+	 * instead of an error. This ROCR error is typically returned if no
+	 * devices are supported.
+	 */
+	if (hsa_ret == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
+		log_level = FI_LOG_INFO;
+		ret = -FI_ENOSYS;
+	} else {
+		log_level = FI_LOG_WARN;
+		ret = -FI_EIO;
+	}
+
+	FI_LOG(&core_prov, log_level, FI_LOG_CORE,
+	       "Failed to perform hsa_init: %s\n",
+	       ofi_hsa_status_to_string(hsa_ret));
+
+	rocr_hmem_dl_cleanup();
+
+	return ret;
+}
+
+int rocr_hmem_cleanup(void)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = ofi_hsa_shut_down();
+	if (hsa_ret != HSA_STATUS_SUCCESS) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to perform hsa_shut_down: %s\n",
+			ofi_hsa_status_to_string(hsa_ret));
+		return -FI_ENODATA;
+	}
+
+	rocr_hmem_dl_cleanup();
+
+	return FI_SUCCESS;
+}
+
+#else
+
+int rocr_memcpy(uint64_t device, void *dest, const void *src, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+int rocr_hmem_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int rocr_hmem_cleanup(void)
+{
+	return -FI_ENOSYS;
+}
+
+bool rocr_is_addr_valid(const void *addr)
+{
+	return false;
+}
+
+#endif /* HAVE_ROCR */
diff --git a/deps/libfabric/src/hmem_ze.c b/deps/libfabric/src/hmem_ze.c
new file mode 100644
index 0000000000000000000000000000000000000000..892699a68b3ce023ba103cfb50845be8c9ea59bb
--- /dev/null
+++ b/deps/libfabric/src/hmem_ze.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "ofi_hmem.h"
+#include "ofi.h"
+
+#ifdef HAVE_LIBZE
+
+#include <level_zero/ze_api.h>
+
+#define ZE_MAX_DEVICES 4
+
+static ze_context_handle_t context;
+static ze_device_handle_t devices[ZE_MAX_DEVICES];
+static ze_command_queue_handle_t cmd_queue[ZE_MAX_DEVICES];
+static int num_devices = 0;
+
+static const ze_command_queue_desc_t cq_desc = {
+	.stype		= ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+	.pNext		= NULL,
+	.ordinal	= 0,
+	.index		= 0,
+	.flags		= 0,
+	.mode		= ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+	.priority	= ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
+};
+
+static const ze_command_list_desc_t cl_desc = {
+	.stype				= ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+	.pNext				= NULL,
+	.commandQueueGroupOrdinal	= 0,
+	.flags				= 0,
+};
+
+int ze_hmem_init(void)
+{
+	ze_driver_handle_t driver;
+	ze_context_desc_t context_desc = {0};
+	ze_result_t ze_ret;
+	uint32_t count;
+
+	ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+	if (ze_ret)
+		return -FI_EIO;
+
+	count = 1;
+	ze_ret = zeDriverGet(&count, &driver);
+	if (ze_ret)
+		return -FI_EIO;
+
+	ze_ret = zeContextCreate(driver, &context_desc, &context);
+	if (ze_ret)
+		return -FI_EIO;
+
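+	/* Query the device count first, then fetch up to ZE_MAX_DEVICES
+	 * handles and create one synchronous command queue per device.
+	 */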
+	count = 0;
+	ze_ret = zeDeviceGet(driver, &count, NULL);
+	if (ze_ret || count > ZE_MAX_DEVICES)
+		goto err;
+
+	ze_ret = zeDeviceGet(driver, &count, devices);
+	if (ze_ret)
+		goto err;
+
+	for (num_devices = 0; num_devices < count; num_devices++) {
+		ze_ret = zeCommandQueueCreate(context, devices[num_devices], &cq_desc,
+					      &cmd_queue[num_devices]);
+		if (ze_ret)
+			goto err;
+	}
+
+	return FI_SUCCESS;
+
+err:
+	(void) ze_hmem_cleanup();
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to initialize ZE driver resources\n");
+
+	return -FI_EIO;
+}
+
+int ze_hmem_cleanup(void)
+{
+	int i, ret = FI_SUCCESS;
+
+	for (i = 0; i < num_devices; i++) {
+		if (cmd_queue[i] && zeCommandQueueDestroy(cmd_queue[i])) {
+			FI_WARN(&core_prov, FI_LOG_CORE,
+				"Failed to destroy ZE cmd_queue\n");
+			ret = -FI_EINVAL;
+		}
+	}
+
+	if (zeContextDestroy(context))
+		return -FI_EINVAL;
+
+	return ret;
+}
+
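+/* Copy through a one-shot command list. The queue was created with
+ * ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, so executing the list blocks until
+ * the copy completes.
+ */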
+int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size)
+{
+	ze_command_list_handle_t cmd_list;
+	ze_result_t ze_ret;
+	int dev_id = (int) device;
+
+	ze_ret = zeCommandListCreate(context, devices[dev_id], &cl_desc, &cmd_list);
+	if (ze_ret)
+		goto err;
+
+	ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL, 0, NULL);
+	if (ze_ret)
+		goto free;
+
+	ze_ret = zeCommandListClose(cmd_list);
+	if (ze_ret)
+		goto free;
+
+	ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[dev_id], 1,
+						   &cmd_list, NULL);
+
+free:
+	if (!zeCommandListDestroy(cmd_list) && !ze_ret)
+		return FI_SUCCESS;
+err:
+	FI_WARN(&core_prov, FI_LOG_CORE,
+		"Failed to perform ze copy (%d)\n", ze_ret);
+
+	return -FI_EIO;
+}
+
+bool ze_is_addr_valid(const void *addr)
+{
+	ze_result_t ze_ret;
+	ze_memory_allocation_properties_t mem_prop;
+	int i;
+
+	for (i = 0; i < num_devices; i++) {
+		ze_ret = zeMemGetAllocProperties(context, addr, &mem_prop,
+						 &devices[i]);
+		if (!ze_ret && mem_prop.type == ZE_MEMORY_TYPE_DEVICE)
+			return true;
+	}
+	return false;
+}
+
+int ze_hmem_get_handle(void *dev_buf, void **handle)
+{
+	ze_result_t ze_ret;
+
+	ze_ret = zeMemGetIpcHandle(context, dev_buf,
+				   (ze_ipc_mem_handle_t *) handle);
+	if (ze_ret) {
+		FI_WARN(&core_prov, FI_LOG_CORE, "Unable to get handle\n");
+		return -FI_EINVAL;
+	}
+
+	return FI_SUCCESS;
+}
+
+int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr)
+{
+	ze_result_t ze_ret;
+
+	ze_ret = zeMemOpenIpcHandle(context, devices[device],
+				    *((ze_ipc_mem_handle_t *) handle),
+				    0, ipc_ptr);
+	if (ze_ret) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Unable to open memory handle\n");
+		return -FI_EINVAL;
+	}
+
+	return FI_SUCCESS;
+}
+
+int ze_hmem_close_handle(void *ipc_ptr)
+{
+	ze_result_t ze_ret;
+
+	ze_ret = zeMemCloseIpcHandle(context, ipc_ptr);
+	if (ze_ret) {
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Unable to close memory handle\n");
+		return -FI_EINVAL;
+	}
+
+	return FI_SUCCESS;
+}
+
+#else
+
+int ze_hmem_init(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ze_hmem_cleanup(void)
+{
+	return -FI_ENOSYS;
+}
+
+int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size)
+{
+	return -FI_ENOSYS;
+}
+
+bool ze_is_addr_valid(const void *addr)
+{
+	return false;
+}
+
+int ze_hmem_get_handle(void *dev_buf, void **handle)
+{
+	return -FI_ENOSYS;
+}
+
+int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+int ze_hmem_close_handle(void *ipc_ptr)
+{
+	return -FI_ENOSYS;
+}
+
+#endif /* HAVE_LIBZE */
diff --git a/deps/libfabric/src/indexer.c b/deps/libfabric/src/indexer.c
index d094a0e1a09483c14964bc135c83f0194b95dbcb..51ced09c8e2885168c3d24ae89021f04954bcc31 100644
--- a/deps/libfabric/src/indexer.c
+++ b/deps/libfabric/src/indexer.c
@@ -36,7 +36,7 @@
 #include <errno.h>
 #include <sys/types.h>
 #include <stdlib.h>
-
+#include <assert.h>
 #include <ofi_indexer.h>
 
 /*
@@ -113,6 +113,31 @@ void *ofi_idx_remove(struct indexer *idx, int index)
 	return item;
 }
 
+void *ofi_idx_remove_ordered(struct indexer *idx, int index)
+{
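+	/* Variant of ofi_idx_remove that keeps the free list sorted in
+	 * ascending index order, so freed slots are reused lowest-index
+	 * first.
+	 */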
+	struct ofi_idx_entry *entry;
+	void *item;
+	int temp_index;
+	int entry_index = ofi_idx_entry_index(index);
+
+	entry = idx->array[ofi_idx_array_index(index)];
+	item = entry[entry_index].item;
+	entry[entry_index].item = NULL;
+	if (ofi_idx_free_list_empty(idx) || index < idx->free_list) {
+		entry[entry_index].next = idx->free_list;
+		idx->free_list = index;
+		return item;
+	}
+	temp_index = idx->free_list;
+	while (entry[ofi_idx_entry_index(temp_index)].next < index) {
+		temp_index = entry[ofi_idx_entry_index(temp_index)].next;
+	}
+	entry[entry_index].next = entry[ofi_idx_entry_index(temp_index)].next;
+	entry[ofi_idx_entry_index(temp_index)].next = index;
+
+	return item;
+}
+
 void ofi_idx_replace(struct indexer *idx, int index, void *item)
 {
 	struct ofi_idx_entry *entry;
diff --git a/deps/libfabric/src/tree.c b/deps/libfabric/src/tree.c
index 087826c939f307169829725b2f2b1f70d218f62b..7419bd744d92447aaa0e9f7d066137281f513de0 100644
--- a/deps/libfabric/src/tree.c
+++ b/deps/libfabric/src/tree.c
@@ -104,20 +104,20 @@ static void ofi_delete_tree(struct ofi_rbmap *map, struct ofi_rbnode *node)
 }
 
 void ofi_rbmap_cleanup(struct ofi_rbmap *map)
-{
-	ofi_delete_tree(map, map->root);
-}
-
-void ofi_rbmap_destroy(struct ofi_rbmap *map)
 {
 	struct ofi_rbnode *node;
 
-	ofi_rbmap_cleanup(map);
+	ofi_delete_tree(map, map->root);
 	while (map->free_list) {
 		node = map->free_list;
 		map->free_list = node->right;
 		free(node);
 	}
+}
+
+void ofi_rbmap_destroy(struct ofi_rbmap *map)
+{
+	ofi_rbmap_cleanup(map);
 	free(map);
 }
 
@@ -229,8 +229,11 @@ int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data,
 
 	while (current != &map->sentinel) {
 		ret = map->compare(map, key, current->data);
-		if (ret == 0)
+		if (ret == 0) {
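+			/* Hand back the existing node so callers hitting
+			 * -FI_EALREADY can use it without a second lookup.
+			 */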
+			if (ret_node)
+				*ret_node = current;
 			return -FI_EALREADY;
+		}
 
 		parent = current;
 		current = (ret < 0) ? current->left : current->right;
@@ -378,6 +381,13 @@ void ofi_rbmap_delete(struct ofi_rbmap *map, struct ofi_rbnode *node)
 	ofi_rbnode_free(map, node);
 }
 
+struct ofi_rbnode *ofi_rbmap_get_root(struct ofi_rbmap *map)
+{
+	if (ofi_rbmap_empty(map))
+		return NULL;
+	return map->root;
+}
+
 struct ofi_rbnode *ofi_rbmap_find(struct ofi_rbmap *map, void *key)
 {
 	struct ofi_rbnode *node;
diff --git a/deps/libfabric/src/windows/osd.c b/deps/libfabric/src/windows/osd.c
index 5a05d8f3bfd1b2d906f941ccfd482a33f6a81484..7c0005d4ecc75e8e9b6b52a701a8ff58431f73fd 100644
--- a/deps/libfabric/src/windows/osd.c
+++ b/deps/libfabric/src/windows/osd.c
@@ -509,11 +509,14 @@ ofi_sendv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags
 	ssize_t size = 0;
 	int ret, i;
 
-	if (iov_cnt == 1)
-		return send(fd, iovec[0].iov_base, iovec[0].iov_len, flags);
+	if (iov_cnt == 1) {
+		return ofi_send_socket(fd, iovec[0].iov_base,
+				       iovec[0].iov_len, flags);
+	}
 
 	for (i = 0; i < iov_cnt; i++) {
-		ret = send(fd, iovec[i].iov_base, iovec[i].iov_len, flags);
+		ret = ofi_send_socket(fd, iovec[i].iov_base,
+				      iovec[i].iov_len, flags);
 		if (ret >= 0) {
 			size += ret;
 			if (ret != iovec[i].iov_len)
@@ -531,11 +534,14 @@ ofi_recvv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags
 	ssize_t size = 0;
 	int ret, i;
 
-	if (iov_cnt == 1)
-		return recv(fd, iovec[0].iov_base, iovec[0].iov_len, flags);
+	if (iov_cnt == 1) {
+		return ofi_recv_socket(fd, iovec[0].iov_base,
+				       iovec[0].iov_len, flags);
+	}
 
 	for (i = 0; i < iov_cnt; i++) {
-		ret = recv(fd, iovec[i].iov_base, iovec[i].iov_len, flags);
+		ret = ofi_recv_socket(fd, iovec[i].iov_base,
+				      iovec[i].iov_len, flags);
 		if (ret >= 0) {
 			size += ret;
 			if (ret != iovec[i].iov_len)
diff --git a/deps/libfabric/util/info.c b/deps/libfabric/util/info.c
index e3c277a46724b1f29153554f6d62f154cc0fd9cc..ed924a528bf3131934e1d268b84f9c6b43b11ac6 100644
--- a/deps/libfabric/util/info.c
+++ b/deps/libfabric/util/info.c
@@ -80,7 +80,7 @@ static const char *help_strings[][2] = {
 	{"FMT", "\t\tspecify accepted address format: FI_FORMAT_UNSPEC, FI_SOCKADDR..."},
 	{"PROV", "\t\tspecify provider explicitly"},
 	{"", "\t\tprint libfabric environment variables"},
-	{"", "\t\tprint libfabric environment variables with substr"},
+	{"SUBSTR", "\t\tprint libfabric environment variables with substr"},
 	{"", "\t\tlist available libfabric providers"},
 	{"", "\t\tverbose output"},
 	{"", "\t\tprint version info and exit"},
@@ -119,6 +119,7 @@ static int str2cap(char *inputstr, uint64_t *value)
 	ORCASE(FI_TAGGED);
 	ORCASE(FI_ATOMIC);
 	ORCASE(FI_MULTICAST);
+	ORCASE(FI_COLLECTIVE);
 
 	ORCASE(FI_READ);
 	ORCASE(FI_WRITE);
diff --git a/deps/libfabric/util/pingpong.c b/deps/libfabric/util/pingpong.c
index 02e4f2cbad09339eedcd056b8d15ac4671b7fe86..ca419be26a67549640e7c56e8946b2c9cf007b6d 100644
--- a/deps/libfabric/util/pingpong.c
+++ b/deps/libfabric/util/pingpong.c
@@ -525,6 +525,15 @@ static int pp_ctrl_recv(struct ct_pingpong *ct, char *buf, size_t size)
 	return ret;
 }
 
+static int pp_ctrl_recv_str(struct ct_pingpong *ct, char *buf, size_t size)
+{
+	int ret;
+
+	ret = pp_ctrl_recv(ct, buf, size);
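+	/* NUL-terminate unconditionally so the caller can safely apply
+	 * strcmp()/strlen() even after a short or failed receive.
+	 */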
+	buf[size - 1] = '\0';
+	return ret;
+}
+
 static int pp_send_name(struct ct_pingpong *ct, struct fid *endpoint)
 {
 	size_t addrlen = 0;
@@ -654,12 +663,11 @@ static int pp_ctrl_sync(struct ct_pingpong *ct)
 		}
 		PP_DEBUG("CLIENT: syncing now\n");
 
-		ret = pp_ctrl_recv(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_A));
+		ret = pp_ctrl_recv_str(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_A));
 		PP_DEBUG("CLIENT: after recv / ret=%d\n", ret);
 		if (ret < 0)
 			return ret;
 		if (strcmp(ct->ctrl_buf, PP_MSG_SYNC_A)) {
-			ct->ctrl_buf[PP_CTRL_BUF_LEN] = '\0';
 			PP_DEBUG("CLIENT: sync error while acking A: <%s> "
 				 "(len=%zu)\n",
 				 ct->ctrl_buf, strlen(ct->ctrl_buf));
@@ -668,12 +676,11 @@ static int pp_ctrl_sync(struct ct_pingpong *ct)
 		PP_DEBUG("CLIENT: synced\n");
 	} else {
 		PP_DEBUG("SERVER: syncing\n");
-		ret = pp_ctrl_recv(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_Q));
+		ret = pp_ctrl_recv_str(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_Q));
 		PP_DEBUG("SERVER: after recv / ret=%d\n", ret);
 		if (ret < 0)
 			return ret;
 		if (strcmp(ct->ctrl_buf, PP_MSG_SYNC_Q)) {
-			ct->ctrl_buf[PP_CTRL_BUF_LEN] = '\0';
 			PP_DEBUG("SERVER: sync error while acking Q: <%s> "
 				 "(len=%zu)\n",
 				 ct->ctrl_buf, strlen(ct->ctrl_buf));
@@ -724,8 +731,8 @@ static int pp_ctrl_txrx_msg_count(struct ct_pingpong *ct)
 		}
 		PP_DEBUG("CLIENT: sent count\n");
 
-		ret =
-		    pp_ctrl_recv(ct, ct->ctrl_buf, sizeof(PP_MSG_CHECK_CNT_OK));
+		ret = pp_ctrl_recv_str(ct, ct->ctrl_buf,
+				       sizeof(PP_MSG_CHECK_CNT_OK));
 		if (ret < 0)
 			return ret;
 		if (ret < sizeof(PP_MSG_CHECK_CNT_OK)) {
@@ -1829,7 +1836,7 @@ static void pp_free_res(struct ct_pingpong *ct)
 
 	free(ct->rem_name);
 	free(ct->local_name);
-	
+
 	if (ct->buf) {
 		ofi_freealign(ct->buf);
 		ct->buf = ct->rx_buf = ct->tx_buf = NULL;